Common.hs 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. module Data.SCargot.Common ( -- $intro
  2. -- * Lisp Identifier Syntaxes
  3. parseR5RSIdent
  4. , parseR6RSIdent
  5. , parseR7RSIdent
  6. -- * Numeric Literal Parsers
  7. , signed
  8. , prefixedNumber
  9. , signedPrefixedNumber
  10. , binNumber
  11. , signedBinNumber
  12. , octNumber
  13. , signedOctNumber
  14. , decNumber
  15. , signedDecNumber
  16. , dozNumber
  17. , signedDozNumber
  18. , hexNumber
  19. , signedHexNumber
  20. ) where
  21. #if !MIN_VERSION_base(4,8,0)
  22. import Control.Applicative hiding ((<|>), many)
  23. #endif
  24. import Data.Char
  25. import Data.Text (Text)
  26. import qualified Data.Text as T
  27. import Text.Parsec
  28. import Text.Parsec.Text (Parser)
  29. -- | Parse an identifier according to the R5RS Scheme standard. This
  30. -- will not normalize case, even though the R5RS standard specifies
  31. -- that all identifiers be normalized to lower case first.
  32. --
  33. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  34. -- and may include various symbols, but no escapes.
  35. parseR5RSIdent :: Parser Text
  36. parseR5RSIdent =
  37. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  38. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  39. subsequent = initial <|> digit <|> oneOf "+-.@"
  40. peculiar = string "+" <|> string "-" <|> string "..."
  41. hasCategory :: Char -> [GeneralCategory] -> Bool
  42. hasCategory c cs = generalCategory c `elem` cs
  43. -- | Parse an identifier according to the R6RS Scheme standard. An
  44. -- R6RS identifier may include inline hexadecimal escape sequences
  45. -- so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is
  46. -- more liberal than R5RS as to which Unicode characters it may
  47. -- accept.
  48. parseR6RSIdent :: Parser Text
  49. parseR6RSIdent =
  50. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  51. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  52. constituent = letter
  53. <|> uniClass (\ c -> isLetter c ||
  54. isSymbol c ||
  55. hasCategory c
  56. [ NonSpacingMark
  57. , LetterNumber
  58. , OtherNumber
  59. , DashPunctuation
  60. , ConnectorPunctuation
  61. , OtherPunctuation
  62. , PrivateUse
  63. ])
  64. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  65. subsequent = initial <|> digit <|> oneOf "+-.@"
  66. <|> uniClass (\ c -> hasCategory c
  67. [ DecimalNumber
  68. , SpacingCombiningMark
  69. , EnclosingMark
  70. ])
  71. peculiar = string "+" <|> string "-" <|> string "..." <|>
  72. ((++) <$> string "->" <*> many subsequent)
  73. uniClass :: (Char -> Bool) -> Parser Char
  74. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  75. -- | Parse an identifier according to the R7RS Scheme standard. An
  76. -- R7RS identifier, in addition to a typical identifier format,
  77. -- can also be a chunk of text surrounded by vertical bars that
  78. -- can contain spaces and other characters. Unlike R6RS, it does
  79. -- not allow escapes to be included in identifiers unless those
  80. -- identifiers are surrounded by vertical bars.
  81. parseR7RSIdent :: Parser Text
  82. parseR7RSIdent = T.pack <$>
  83. ( (:) <$> initial <*> many subsequent
  84. <|> char '|' *> many1 symbolElement <* char '|'
  85. <|> peculiar
  86. )
  87. where initial = letter <|> specInit
  88. specInit = oneOf "!$%&*/:<=>?^_~"
  89. subsequent = initial <|> digit <|> specSubsequent
  90. specSubsequent = expSign <|> oneOf ".@"
  91. expSign = oneOf "+-"
  92. symbolElement = noneOf "\\|"
  93. <|> hexEscape
  94. <|> mnemEscape
  95. <|> ('|' <$ string "\\|")
  96. hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
  97. mnemEscape = '\a' <$ string "\\a"
  98. <|> '\b' <$ string "\\b"
  99. <|> '\t' <$ string "\\t"
  100. <|> '\n' <$ string "\\n"
  101. <|> '\r' <$ string "\\r"
  102. peculiar = (:[]) <$> expSign
  103. <|> cons2 <$> expSign <*> signSub <*> many subsequent
  104. <|> cons3 <$> expSign
  105. <*> char '.'
  106. <*> dotSub
  107. <*> many subsequent
  108. <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
  109. dotSub = signSub <|> char '.'
  110. signSub = initial <|> expSign <|> char '@'
  111. cons2 a b cs = a : b : cs
  112. cons3 a b c ds = a : b : c : ds
  113. -- | A helper function for defining parsers for arbitrary-base integers.
  114. -- The first argument will be the base, and the second will be the
  115. -- parser for the individual digits.
  116. number :: Integer -> Parser Char -> Parser Integer
  117. number base digits = foldl go 0 <$> many1 digits
  118. where go x d = base * x + toInteger (value d)
  119. value c
  120. | c == 'a' || c == 'A' = 0xa
  121. | c == 'b' || c == 'B' = 0xb
  122. | c == 'c' || c == 'C' = 0xc
  123. | c == 'd' || c == 'D' = 0xd
  124. | c == 'e' || c == 'E' = 0xe
  125. | c == 'f' || c == 'F' = 0xf
  126. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  127. | c == '\x218a' = 0xa
  128. | c == '\x218b' = 0xb
  129. | otherwise = error ("Unknown letter in number: " ++ show c)
  130. sign :: Num a => Parser (a -> a)
  131. sign = (pure id <* char '+')
  132. <|> (pure negate <* char '-')
  133. <|> pure id
  134. -- | Given a parser for some kind of numeric literal, this will attempt to
  135. -- parse a leading @+@ or a leading @-@ followed by the numeric literal,
  136. -- and if a @-@ is found, negate that literal.
  137. signed :: Num a => Parser a -> Parser a
  138. signed p = ($) <$> sign <*> p
  139. -- | Parses a number in the same way as 'prefixedNumber', with an optional
  140. -- leading @+@ or @-@.
  141. signedPrefixedNumber :: Parser Integer
  142. signedPrefixedNumber = signed prefixedNumber
  143. -- | Parses a number, determining which numeric base to use by examining
  144. -- the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a
  145. -- dozenal number, @0o@ for an octal number, and @0b@ for a binary
  146. -- number (as well as the upper-case versions of the same.) If the
  147. -- base is omitted entirely, then it is treated as a decimal number.
  148. prefixedNumber :: Parser Integer
  149. prefixedNumber = (string "0x" <|> string "0X") *> hexNumber
  150. <|> (string "0o" <|> string "0O") *> octNumber
  151. <|> (string "0z" <|> string "0Z") *> dozNumber
  152. <|> (string "0b" <|> string "0B") *> binNumber
  153. <|> decNumber
  154. -- | A parser for non-signed binary numbers
  155. binNumber :: Parser Integer
  156. binNumber = number 2 (char '0' <|> char '1')
  157. -- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
  158. signedBinNumber :: Parser Integer
  159. signedBinNumber = signed binNumber
  160. -- | A parser for non-signed octal numbers
  161. octNumber :: Parser Integer
  162. octNumber = number 8 (oneOf "01234567")
  163. -- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
  164. signedOctNumber :: Parser Integer
  165. signedOctNumber = ($) <$> sign <*> octNumber
  166. -- | A parser for non-signed decimal numbers
  167. decNumber :: Parser Integer
  168. decNumber = number 10 digit
  169. -- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
  170. signedDecNumber :: Parser Integer
  171. signedDecNumber = ($) <$> sign <*> decNumber
  172. dozDigit :: Parser Char
  173. dozDigit = digit <|> oneOf "AaBb\x218a\x218b"
  174. -- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
  175. -- the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
  176. -- and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
  177. -- respectively.
  178. dozNumber :: Parser Integer
  179. dozNumber = number 12 dozDigit
  180. -- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
  181. signedDozNumber :: Parser Integer
  182. signedDozNumber = ($) <$> sign <*> dozNumber
  183. -- | A parser for non-signed hexadecimal numbers
  184. hexNumber :: Parser Integer
  185. hexNumber = number 16 hexDigit
  186. -- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
  187. signedHexNumber :: Parser Integer
  188. signedHexNumber = ($) <$> sign <*> hexNumber
  189. {- $intro
  190. This module contains a selection of parsers for different kinds of
  191. identifiers and literals, from which more elaborate parsers can be
  192. assembled. These can afford the user a quick way of building parsers
  193. for different atom types.
  194. -}