Common.hs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. module Data.SCargot.Common ( -- $intro
  2. -- * Identifier Syntaxes
  3. parseR5RSIdent
  4. , parseR6RSIdent
  5. , parseR7RSIdent
  6. , parseXIDIdentStrict
  7. , parseXIDIdentGeneral
  8. , parseHaskellIdent
  9. , parseHaskellVariable
  10. , parseHaskellConstructor
  11. -- * Numeric Literal Parsers
  12. , signed
  13. , prefixedNumber
  14. , signedPrefixedNumber
  15. , binNumber
  16. , signedBinNumber
  17. , octNumber
  18. , signedOctNumber
  19. , decNumber
  20. , signedDecNumber
  21. , dozNumber
  22. , signedDozNumber
  23. , hexNumber
  24. , signedHexNumber
  25. -- ** Numeric Literals for Arbitrary Bases
  26. , commonLispNumberAnyBase
  27. , gnuM4NumberAnyBase
  28. -- ** Source locations
  29. , Location(..), Located(..), located, dLocation
  30. ) where
  31. #if !MIN_VERSION_base(4,8,0)
  32. import Control.Applicative hiding ((<|>), many)
  33. #endif
  34. import Control.Monad (guard)
  35. import Data.Char
  36. import Data.Text (Text)
  37. import qualified Data.Text as T
  38. import Text.Parsec
  39. import Text.Parsec.Pos (newPos)
  40. import Text.Parsec.Text (Parser)
  41. -- | Parse an identifier according to the R5RS Scheme standard. This
  42. -- will not normalize case, even though the R5RS standard specifies
  43. -- that all identifiers be normalized to lower case first.
  44. --
  45. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  46. -- and may include various symbols, but no escapes.
  47. parseR5RSIdent :: Parser Text
  48. parseR5RSIdent =
  49. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  50. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  51. subsequent = initial <|> digit <|> oneOf "+-.@"
  52. peculiar = string "+" <|> string "-" <|> string "..."
  53. hasCategory :: Char -> [GeneralCategory] -> Bool
  54. hasCategory c cs = generalCategory c `elem` cs
  55. -- | Parse an identifier according to the R6RS Scheme standard. An
  56. -- R6RS identifier may include inline hexadecimal escape sequences
  57. -- so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is
  58. -- more liberal than R5RS as to which Unicode characters it may
  59. -- accept.
  60. parseR6RSIdent :: Parser Text
  61. parseR6RSIdent =
  62. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  63. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  64. constituent = letter
  65. <|> uniClass (\ c -> isLetter c ||
  66. isSymbol c ||
  67. hasCategory c
  68. [ NonSpacingMark
  69. , LetterNumber
  70. , OtherNumber
  71. , DashPunctuation
  72. , ConnectorPunctuation
  73. , OtherPunctuation
  74. , PrivateUse
  75. ])
  76. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  77. subsequent = initial <|> digit <|> oneOf "+-.@"
  78. <|> uniClass (\ c -> hasCategory c
  79. [ DecimalNumber
  80. , SpacingCombiningMark
  81. , EnclosingMark
  82. ])
  83. peculiar = string "+" <|> string "-" <|> string "..." <|>
  84. ((++) <$> string "->" <*> many subsequent)
  85. uniClass :: (Char -> Bool) -> Parser Char
  86. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  87. -- | Parse an identifier according to the R7RS Scheme standard. An
  88. -- R7RS identifier, in addition to a typical identifier format,
  89. -- can also be a chunk of text surrounded by vertical bars that
  90. -- can contain spaces and other characters. Unlike R6RS, it does
  91. -- not allow escapes to be included in identifiers unless those
  92. -- identifiers are surrounded by vertical bars.
  93. parseR7RSIdent :: Parser Text
  94. parseR7RSIdent = T.pack <$>
  95. ( (:) <$> initial <*> many subsequent
  96. <|> char '|' *> many1 symbolElement <* char '|'
  97. <|> peculiar
  98. )
  99. where initial = letter <|> specInit
  100. specInit = oneOf "!$%&*/:<=>?^_~"
  101. subsequent = initial <|> digit <|> specSubsequent
  102. specSubsequent = expSign <|> oneOf ".@"
  103. expSign = oneOf "+-"
  104. symbolElement = noneOf "\\|"
  105. <|> hexEscape
  106. <|> mnemEscape
  107. <|> ('|' <$ string "\\|")
  108. hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
  109. mnemEscape = '\a' <$ string "\\a"
  110. <|> '\b' <$ string "\\b"
  111. <|> '\t' <$ string "\\t"
  112. <|> '\n' <$ string "\\n"
  113. <|> '\r' <$ string "\\r"
  114. peculiar = (:[]) <$> expSign
  115. <|> cons2 <$> expSign <*> signSub <*> many subsequent
  116. <|> cons3 <$> expSign
  117. <*> char '.'
  118. <*> dotSub
  119. <*> many subsequent
  120. <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
  121. dotSub = signSub <|> char '.'
  122. signSub = initial <|> expSign <|> char '@'
  123. cons2 a b cs = a : b : cs
  124. cons3 a b c ds = a : b : c : ds
  125. -- | Parse a Haskell variable identifier: a sequence of alphanumeric
  126. -- characters, underscores, or single quote that begins with a
  127. -- lower-case letter.
  128. parseHaskellVariable :: Parser Text
  129. parseHaskellVariable =
  130. T.pack <$> ((:) <$> small <*> many (small <|>
  131. large <|>
  132. digit' <|>
  133. char '\'' <|>
  134. char '_'))
  135. where small = satisfy isLower
  136. large = satisfy isUpper
  137. digit' = satisfy isDigit
  138. -- | Parse a Haskell constructor: a sequence of alphanumeric
  139. -- characters, underscores, or single quote that begins with an
  140. -- upper-case letter.
  141. parseHaskellConstructor :: Parser Text
  142. parseHaskellConstructor =
  143. T.pack <$> ((:) <$> large <*> many (small <|>
  144. large <|>
  145. digit' <|>
  146. char '\'' <|>
  147. char '_'))
  148. where small = satisfy isLower
  149. large = satisfy isUpper
  150. digit' = satisfy isDigit
  151. -- | Parse a Haskell identifer: a sequence of alphanumeric
  152. -- characters, underscores, or a single quote. This matches both
  153. -- variable and constructor names.
  154. parseHaskellIdent :: Parser Text
  155. parseHaskellIdent =
  156. T.pack <$> ((:) <$> (large <|> small)
  157. <*> many (small <|>
  158. large <|>
  159. digit' <|>
  160. char '\'' <|>
  161. char '_'))
  162. where small = satisfy isLower
  163. large = satisfy isUpper
  164. digit' = satisfy isDigit
  165. -- Ensure that a given character has the given Unicode category
  166. hasCat :: [GeneralCategory] -> Parser Char
  167. hasCat cats = satisfy (flip hasCategory cats)
  168. xidStart :: [GeneralCategory]
  169. xidStart = [ UppercaseLetter
  170. , LowercaseLetter
  171. , TitlecaseLetter
  172. , ModifierLetter
  173. , OtherLetter
  174. , LetterNumber
  175. ]
  176. xidContinue :: [GeneralCategory]
  177. xidContinue = xidStart ++ [ NonSpacingMark
  178. , SpacingCombiningMark
  179. , DecimalNumber
  180. , ConnectorPunctuation
  181. ]
  182. -- | Parse an identifier of unicode characters of the form
  183. -- @<XID_Start> <XID_Continue>*@, which corresponds strongly
  184. -- to the identifiers found in most C-like languages. Note that
  185. -- the @XID_Start@ category does not include the underscore,
  186. -- so @__foo@ is not a valid XID identifier. To parse
  187. -- identifiers that may include leading underscores, use
  188. -- 'parseXIDIdentGeneral'.
  189. parseXIDIdentStrict :: Parser Text
  190. parseXIDIdentStrict = T.pack <$> ((:) <$> hasCat xidStart
  191. <*> many (hasCat xidContinue))
  192. -- | Parse an identifier of unicode characters of the form
  193. -- @(<XID_Start> | '_') <XID_Continue>*@, which corresponds
  194. -- strongly to the identifiers found in most C-like languages.
  195. -- Unlike 'parseXIDIdentStrict', this will also accept an
  196. -- underscore as leading character, which corresponds more
  197. -- closely to programming languages like C and Java, but
  198. -- deviates somewhat from the
  199. -- <http://unicode.org/reports/tr31/ Unicode Identifier and
  200. -- Pattern Syntax standard>.
  201. parseXIDIdentGeneral :: Parser Text
  202. parseXIDIdentGeneral = T.pack <$> ((:) <$> (hasCat xidStart <|> char '_')
  203. <*> many (hasCat xidContinue))
  204. -- | A helper function for defining parsers for arbitrary-base integers.
  205. -- The first argument will be the base, and the second will be the
  206. -- parser for the individual digits.
  207. number :: Integer -> Parser Char -> Parser Integer
  208. number base digits = foldl go 0 <$> many1 digits
  209. where go x d = base * x + toInteger (value d)
  210. value c
  211. | c >= 'a' && c <= 'z' = 0xa + (fromEnum c - fromEnum 'a')
  212. | c >= 'A' && c <= 'Z' = 0xa + (fromEnum c - fromEnum 'A')
  213. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  214. | c == '\x218a' = 0xa
  215. | c == '\x218b' = 0xb
  216. | otherwise = error ("Unknown letter in number: " ++ show c)
  217. digitsFor :: Int -> [Char]
  218. digitsFor n
  219. | n <= 10 = take n ['0'..'9']
  220. | n <= 36 = take (n-10) ['A'..'Z'] ++ take (n-10) ['a'..'z'] ++ ['0'..'9']
  221. | otherwise = error ("Invalid base for parser: " ++ show n)
  222. anyBase :: Integer -> Parser Integer
  223. anyBase n = number n (oneOf (digitsFor (fromIntegral n)))
  224. -- | A parser for Common Lisp's arbitrary-base number syntax, of
  225. -- the form @#[base]r[number]@, where the base is given in
  226. -- decimal. Note that this syntax begins with a @#@, which
  227. -- means it might conflict with defined reader macros.
  228. commonLispNumberAnyBase :: Parser Integer
  229. commonLispNumberAnyBase = do
  230. _ <- char '#'
  231. n <- decNumber
  232. guard (n >= 2 && n <= 36)
  233. _ <- char 'r'
  234. signed (anyBase n)
  235. -- | A parser for GNU m4's arbitrary-base number syntax, of
  236. -- the form @0r[base]:[number]@, where the base is given in
  237. -- decimal.
  238. gnuM4NumberAnyBase :: Parser Integer
  239. gnuM4NumberAnyBase = do
  240. _ <- string "0r"
  241. n <- decNumber
  242. guard (n >= 2 && n <= 36)
  243. _ <- char ':'
  244. signed (anyBase n)
  245. sign :: Num a => Parser (a -> a)
  246. sign = (pure id <* char '+')
  247. <|> (pure negate <* char '-')
  248. <|> pure id
  249. -- | Given a parser for some kind of numeric literal, this will attempt to
  250. -- parse a leading @+@ or a leading @-@ followed by the numeric literal,
  251. -- and if a @-@ is found, negate that literal.
  252. signed :: Num a => Parser a -> Parser a
  253. signed p = ($) <$> sign <*> p
  254. -- | Parses a number in the same way as 'prefixedNumber', with an optional
  255. -- leading @+@ or @-@.
  256. signedPrefixedNumber :: Parser Integer
  257. signedPrefixedNumber = signed prefixedNumber
  258. -- | Parses a number, determining which numeric base to use by examining
  259. -- the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a
  260. -- dozenal number, @0o@ for an octal number, and @0b@ for a binary
  261. -- number (as well as the upper-case versions of the same.) If the
  262. -- base is omitted entirely, then it is treated as a decimal number.
  263. prefixedNumber :: Parser Integer
  264. prefixedNumber = (string "0x" <|> string "0X") *> hexNumber
  265. <|> (string "0o" <|> string "0O") *> octNumber
  266. <|> (string "0z" <|> string "0Z") *> dozNumber
  267. <|> (string "0b" <|> string "0B") *> binNumber
  268. <|> decNumber
  269. -- | A parser for non-signed binary numbers
  270. binNumber :: Parser Integer
  271. binNumber = number 2 (char '0' <|> char '1')
  272. -- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
  273. signedBinNumber :: Parser Integer
  274. signedBinNumber = signed binNumber
  275. -- | A parser for non-signed octal numbers
  276. octNumber :: Parser Integer
  277. octNumber = number 8 (oneOf "01234567")
  278. -- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
  279. signedOctNumber :: Parser Integer
  280. signedOctNumber = ($) <$> sign <*> octNumber
  281. -- | A parser for non-signed decimal numbers
  282. decNumber :: Parser Integer
  283. decNumber = number 10 digit
  284. -- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
  285. signedDecNumber :: Parser Integer
  286. signedDecNumber = ($) <$> sign <*> decNumber
  287. dozDigit :: Parser Char
  288. dozDigit = digit <|> oneOf "AaBb\x218a\x218b"
  289. -- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
  290. -- the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
  291. -- and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
  292. -- respectively.
  293. dozNumber :: Parser Integer
  294. dozNumber = number 12 dozDigit
  295. -- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
  296. signedDozNumber :: Parser Integer
  297. signedDozNumber = ($) <$> sign <*> dozNumber
  298. -- | A parser for non-signed hexadecimal numbers
  299. hexNumber :: Parser Integer
  300. hexNumber = number 16 hexDigit
  301. -- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
  302. signedHexNumber :: Parser Integer
  303. signedHexNumber = ($) <$> sign <*> hexNumber
  304. -- |
  305. data Location = Span !SourcePos !SourcePos
  306. deriving (Eq, Ord, Show)
  307. -- | Add support for source locations while parsing S-expressions, as described in this
  308. -- <https://www.reddit.com/r/haskell/comments/4x22f9/labelling_ast_nodes_with_locations/d6cmdy9/ Reddit>
  309. -- thread.
  310. data Located a = At !Location a
  311. deriving (Eq, Ord, Show)
  312. -- | Adds a source span to a parser.
  313. located :: Parser a -> Parser (Located a)
  314. located parser = do
  315. begin <- getPosition
  316. result <- parser
  317. end <- getPosition
  318. return $ At (Span begin end) result
  319. -- | A default location value
  320. dLocation :: Location
  321. dLocation = Span dPos dPos
  322. where dPos = newPos "" 0 0
  323. {- $intro
  324. This module contains a selection of parsers for different kinds of
  325. identifiers and literals, from which more elaborate parsers can be
  326. assembled. These can afford the user a quick way of building parsers
  327. for different atom types.
  328. -}