Common.hs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. module Data.SCargot.Common ( -- $intro
  2. -- * Identifier Syntaxes
  3. parseR5RSIdent
  4. , parseR6RSIdent
  5. , parseR7RSIdent
  6. , parseXIDIdentStrict
  7. , parseXIDIdentGeneral
  8. , parseHaskellIdent
  9. , parseHaskellVariable
  10. , parseHaskellConstructor
  11. -- * Numeric Literal Parsers
  12. , signed
  13. , prefixedNumber
  14. , signedPrefixedNumber
  15. , binNumber
  16. , signedBinNumber
  17. , octNumber
  18. , signedOctNumber
  19. , decNumber
  20. , signedDecNumber
  21. , dozNumber
  22. , signedDozNumber
  23. , hexNumber
  24. , signedHexNumber
  25. ) where
  26. #if !MIN_VERSION_base(4,8,0)
  27. import Control.Applicative hiding ((<|>), many)
  28. #endif
  29. import Data.Char
  30. import Data.Text (Text)
  31. import qualified Data.Text as T
  32. import Text.Parsec
  33. import Text.Parsec.Text (Parser)
  34. -- | Parse an identifier according to the R5RS Scheme standard. This
  35. -- will not normalize case, even though the R5RS standard specifies
  36. -- that all identifiers be normalized to lower case first.
  37. --
  38. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  39. -- and may include various symbols, but no escapes.
  40. parseR5RSIdent :: Parser Text
  41. parseR5RSIdent =
  42. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  43. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  44. subsequent = initial <|> digit <|> oneOf "+-.@"
  45. peculiar = string "+" <|> string "-" <|> string "..."
  46. hasCategory :: Char -> [GeneralCategory] -> Bool
  47. hasCategory c cs = generalCategory c `elem` cs
  48. -- | Parse an identifier according to the R6RS Scheme standard. An
  49. -- R6RS identifier may include inline hexadecimal escape sequences
  50. -- so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is
  51. -- more liberal than R5RS as to which Unicode characters it may
  52. -- accept.
  53. parseR6RSIdent :: Parser Text
  54. parseR6RSIdent =
  55. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  56. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  57. constituent = letter
  58. <|> uniClass (\ c -> isLetter c ||
  59. isSymbol c ||
  60. hasCategory c
  61. [ NonSpacingMark
  62. , LetterNumber
  63. , OtherNumber
  64. , DashPunctuation
  65. , ConnectorPunctuation
  66. , OtherPunctuation
  67. , PrivateUse
  68. ])
  69. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  70. subsequent = initial <|> digit <|> oneOf "+-.@"
  71. <|> uniClass (\ c -> hasCategory c
  72. [ DecimalNumber
  73. , SpacingCombiningMark
  74. , EnclosingMark
  75. ])
  76. peculiar = string "+" <|> string "-" <|> string "..." <|>
  77. ((++) <$> string "->" <*> many subsequent)
  78. uniClass :: (Char -> Bool) -> Parser Char
  79. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  80. -- | Parse an identifier according to the R7RS Scheme standard. An
  81. -- R7RS identifier, in addition to a typical identifier format,
  82. -- can also be a chunk of text surrounded by vertical bars that
  83. -- can contain spaces and other characters. Unlike R6RS, it does
  84. -- not allow escapes to be included in identifiers unless those
  85. -- identifiers are surrounded by vertical bars.
  86. parseR7RSIdent :: Parser Text
  87. parseR7RSIdent = T.pack <$>
  88. ( (:) <$> initial <*> many subsequent
  89. <|> char '|' *> many1 symbolElement <* char '|'
  90. <|> peculiar
  91. )
  92. where initial = letter <|> specInit
  93. specInit = oneOf "!$%&*/:<=>?^_~"
  94. subsequent = initial <|> digit <|> specSubsequent
  95. specSubsequent = expSign <|> oneOf ".@"
  96. expSign = oneOf "+-"
  97. symbolElement = noneOf "\\|"
  98. <|> hexEscape
  99. <|> mnemEscape
  100. <|> ('|' <$ string "\\|")
  101. hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
  102. mnemEscape = '\a' <$ string "\\a"
  103. <|> '\b' <$ string "\\b"
  104. <|> '\t' <$ string "\\t"
  105. <|> '\n' <$ string "\\n"
  106. <|> '\r' <$ string "\\r"
  107. peculiar = (:[]) <$> expSign
  108. <|> cons2 <$> expSign <*> signSub <*> many subsequent
  109. <|> cons3 <$> expSign
  110. <*> char '.'
  111. <*> dotSub
  112. <*> many subsequent
  113. <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
  114. dotSub = signSub <|> char '.'
  115. signSub = initial <|> expSign <|> char '@'
  116. cons2 a b cs = a : b : cs
  117. cons3 a b c ds = a : b : c : ds
  118. -- | Parse a Haskell variable identifier: a sequence of alphanumeric
  119. -- characters, underscores, or single quote that begins with a
  120. -- lower-case letter.
  121. parseHaskellVariable :: Parser Text
  122. parseHaskellVariable =
  123. T.pack <$> ((:) <$> small <*> many (small <|>
  124. large <|>
  125. digit' <|>
  126. char '\'' <|>
  127. char '_'))
  128. where small = satisfy isLower
  129. large = satisfy isUpper
  130. digit' = satisfy isDigit
  131. -- | Parse a Haskell constructor: a sequence of alphanumeric
  132. -- characters, underscores, or single quote that begins with an
  133. -- upper-case letter.
  134. parseHaskellConstructor :: Parser Text
  135. parseHaskellConstructor =
  136. T.pack <$> ((:) <$> large <*> many (small <|>
  137. large <|>
  138. digit' <|>
  139. char '\'' <|>
  140. char '_'))
  141. where small = satisfy isLower
  142. large = satisfy isUpper
  143. digit' = satisfy isDigit
  144. -- | Parse a Haskell identifer: a sequence of alphanumeric
  145. -- characters, underscores, or a single quote. This matches both
  146. -- variable and constructor names.
  147. parseHaskellIdent :: Parser Text
  148. parseHaskellIdent =
  149. T.pack <$> ((:) <$> (large <|> small)
  150. <*> many (small <|>
  151. large <|>
  152. digit' <|>
  153. char '\'' <|>
  154. char '_'))
  155. where small = satisfy isLower
  156. large = satisfy isUpper
  157. digit' = satisfy isDigit
  158. -- Ensure that a given character has the given Unicode category
  159. hasCat :: [GeneralCategory] -> Parser Char
  160. hasCat cats = satisfy (flip hasCategory cats)
  161. xidStart :: [GeneralCategory]
  162. xidStart = [ UppercaseLetter
  163. , LowercaseLetter
  164. , TitlecaseLetter
  165. , ModifierLetter
  166. , OtherLetter
  167. , LetterNumber
  168. ]
  169. xidContinue :: [GeneralCategory]
  170. xidContinue = xidStart ++ [ NonSpacingMark
  171. , SpacingCombiningMark
  172. , DecimalNumber
  173. , ConnectorPunctuation
  174. ]
  175. -- | Parse an identifier of unicode characters of the form
  176. -- @<XID_Start> <XID_Continue>*@, which corresponds strongly
  177. -- to the identifiers found in most C-like languages. Note that
  178. -- the @XID_Start@ category does not include the underscore,
  179. -- so @__foo@ is not a valid XID identifier. To parse
  180. -- identifiers that may include leading underscores, use
  181. -- 'parseXIDIdentGeneral'.
  182. parseXIDIdentStrict :: Parser Text
  183. parseXIDIdentStrict = T.pack <$> ((:) <$> hasCat xidStart
  184. <*> many (hasCat xidContinue))
  185. -- | Parse an identifier of unicode characters of the form
  186. -- @(<XID_Start> | '_') <XID_Continue>*@, which corresponds
  187. -- strongly to the identifiers found in most C-like languages.
  188. -- Unlike 'parseXIDIdentStrict', this will also accept an
  189. -- underscore as leading character, which corresponds more
  190. -- closely to programming languages like C and Java, but
  191. -- deviates somewhat from the
  192. -- <http://unicode.org/reports/tr31/ Unicode Identifier and
  193. -- Pattern Syntax standard>.
  194. parseXIDIdentGeneral :: Parser Text
  195. parseXIDIdentGeneral = T.pack <$> ((:) <$> (hasCat xidStart <|> char '_')
  196. <*> many (hasCat xidContinue))
  197. -- | A helper function for defining parsers for arbitrary-base integers.
  198. -- The first argument will be the base, and the second will be the
  199. -- parser for the individual digits.
  200. number :: Integer -> Parser Char -> Parser Integer
  201. number base digits = foldl go 0 <$> many1 digits
  202. where go x d = base * x + toInteger (value d)
  203. value c
  204. | c == 'a' || c == 'A' = 0xa
  205. | c == 'b' || c == 'B' = 0xb
  206. | c == 'c' || c == 'C' = 0xc
  207. | c == 'd' || c == 'D' = 0xd
  208. | c == 'e' || c == 'E' = 0xe
  209. | c == 'f' || c == 'F' = 0xf
  210. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  211. | c == '\x218a' = 0xa
  212. | c == '\x218b' = 0xb
  213. | otherwise = error ("Unknown letter in number: " ++ show c)
  214. sign :: Num a => Parser (a -> a)
  215. sign = (pure id <* char '+')
  216. <|> (pure negate <* char '-')
  217. <|> pure id
  218. -- | Given a parser for some kind of numeric literal, this will attempt to
  219. -- parse a leading @+@ or a leading @-@ followed by the numeric literal,
  220. -- and if a @-@ is found, negate that literal.
  221. signed :: Num a => Parser a -> Parser a
  222. signed p = ($) <$> sign <*> p
  223. -- | Parses a number in the same way as 'prefixedNumber', with an optional
  224. -- leading @+@ or @-@.
  225. signedPrefixedNumber :: Parser Integer
  226. signedPrefixedNumber = signed prefixedNumber
  227. -- | Parses a number, determining which numeric base to use by examining
  228. -- the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a
  229. -- dozenal number, @0o@ for an octal number, and @0b@ for a binary
  230. -- number (as well as the upper-case versions of the same.) If the
  231. -- base is omitted entirely, then it is treated as a decimal number.
  232. prefixedNumber :: Parser Integer
  233. prefixedNumber = (string "0x" <|> string "0X") *> hexNumber
  234. <|> (string "0o" <|> string "0O") *> octNumber
  235. <|> (string "0z" <|> string "0Z") *> dozNumber
  236. <|> (string "0b" <|> string "0B") *> binNumber
  237. <|> decNumber
  238. -- | A parser for non-signed binary numbers
  239. binNumber :: Parser Integer
  240. binNumber = number 2 (char '0' <|> char '1')
  241. -- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
  242. signedBinNumber :: Parser Integer
  243. signedBinNumber = signed binNumber
  244. -- | A parser for non-signed octal numbers
  245. octNumber :: Parser Integer
  246. octNumber = number 8 (oneOf "01234567")
  247. -- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
  248. signedOctNumber :: Parser Integer
  249. signedOctNumber = ($) <$> sign <*> octNumber
  250. -- | A parser for non-signed decimal numbers
  251. decNumber :: Parser Integer
  252. decNumber = number 10 digit
  253. -- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
  254. signedDecNumber :: Parser Integer
  255. signedDecNumber = ($) <$> sign <*> decNumber
  256. dozDigit :: Parser Char
  257. dozDigit = digit <|> oneOf "AaBb\x218a\x218b"
  258. -- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
  259. -- the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
  260. -- and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
  261. -- respectively.
  262. dozNumber :: Parser Integer
  263. dozNumber = number 12 dozDigit
  264. -- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
  265. signedDozNumber :: Parser Integer
  266. signedDozNumber = ($) <$> sign <*> dozNumber
  267. -- | A parser for non-signed hexadecimal numbers
  268. hexNumber :: Parser Integer
  269. hexNumber = number 16 hexDigit
  270. -- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
  271. signedHexNumber :: Parser Integer
  272. signedHexNumber = ($) <$> sign <*> hexNumber
  273. {- $intro
  274. This module contains a selection of parsers for different kinds of
  275. identifiers and literals, from which more elaborate parsers can be
  276. assembled. These can afford the user a quick way of building parsers
  277. for different atom types.
  278. -}