Common.hs 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. module Data.SCargot.Common ( -- * Numeric Literal Parsers
  2. binNumber
  3. , signedBinNumber
  4. , octNumber
  5. , signedOctNumber
  6. , decNumber
  7. , signedDecNumber
  8. , dozNumber
  9. , signedDozNumber
  10. , hexNumber
  11. , signedHexNumber
  12. , signed
  13. -- * Lisp Identifier Syntaxes
  14. , parseR5RSIdent
  15. , parseR6RSIdent
  16. , parseR7RSIdent
  17. ) where
  18. import Data.Char
  19. import Data.Text (Text)
  20. import qualified Data.Text as T
  21. import Text.Parsec
  22. import Text.Parsec.Char (satisfy)
  23. import Text.Parsec.Text (Parser)
  24. -- | Parse an identifier according to the R5RS Scheme standard. This
  25. -- will not normalize case, even though the R5RS standard specifies
  26. -- that all identifiers be normalized to lower case first.
  27. --
  28. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  29. -- and may include various symbols, but no escapes.
  30. parseR5RSIdent :: Parser Text
  31. parseR5RSIdent =
  32. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  33. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  34. subsequent = initial <|> digit <|> oneOf "+-.@"
  35. peculiar = string "+" <|> string "-" <|> string "..."
  36. hasCategory :: Char -> [GeneralCategory] -> Bool
  37. hasCategory c cs = generalCategory c `elem` cs
  38. -- | Parse an identifier according to the R6RS Scheme standard. An
  39. -- R6RS identifier may include inline hexadecimal escape sequences
  40. -- so that, for example, @foo@ is equivalent to @f\x6f;o@, and is
  41. -- more liberal than R5RS as to which Unicode characters it may
  42. -- accept.
  43. parseR6RSIdent :: Parser Text
  44. parseR6RSIdent =
  45. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  46. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  47. constituent = letter
  48. <|> uniClass (\ c -> isLetter c ||
  49. isSymbol c ||
  50. hasCategory c
  51. [ NonSpacingMark
  52. , LetterNumber
  53. , OtherNumber
  54. , DashPunctuation
  55. , ConnectorPunctuation
  56. , OtherPunctuation
  57. , PrivateUse
  58. ])
  59. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  60. subsequent = initial <|> digit <|> oneOf "+-.@"
  61. <|> uniClass (\ c -> hasCategory c
  62. [ DecimalNumber
  63. , SpacingCombiningMark
  64. , EnclosingMark
  65. ])
  66. peculiar = string "+" <|> string "-" <|> string "..." <|>
  67. ((++) <$> string "->" <*> many subsequent)
  68. uniClass :: (Char -> Bool) -> Parser Char
  69. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  70. -- | Parse an identifier according to the R7RS Scheme standard. An
  71. -- R7RS identifier, in addition to a typical identifier format,
  72. -- can also be a chunk of text surrounded by vertical bars that
  73. -- can contain spaces and other characters. Unlike R6RS, it does
  74. -- not allow escapes to be included in identifiers unless those
  75. -- identifiers are surrounded by vertical bars.
  76. parseR7RSIdent :: Parser Text
  77. parseR7RSIdent = T.pack <$>
  78. ( (:) <$> initial <*> many subsequent
  79. <|> char '|' *> many1 symbolElement <* char '|'
  80. <|> peculiar
  81. )
  82. where initial = letter <|> specInit
  83. specInit = oneOf "!$%&*/:<=>?^_~"
  84. subsequent = initial <|> digit <|> specSubsequent
  85. specSubsequent = expSign <|> oneOf ".@"
  86. expSign = oneOf "+-"
  87. symbolElement = noneOf "\\|"
  88. <|> hexEscape
  89. <|> mnemEscape
  90. <|> ('|' <$ string "\\|")
  91. hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
  92. mnemEscape = '\a' <$ string "\\a"
  93. <|> '\b' <$ string "\\b"
  94. <|> '\t' <$ string "\\t"
  95. <|> '\n' <$ string "\\n"
  96. <|> '\r' <$ string "\\r"
  97. peculiar = (:[]) <$> expSign
  98. <|> cons2 <$> expSign <*> signSub <*> many subsequent
  99. <|> cons3 <$> expSign
  100. <*> char '.'
  101. <*> dotSub
  102. <*> many subsequent
  103. <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
  104. dotSub = signSub <|> char '.'
  105. signSub = initial <|> expSign <|> char '@'
  106. cons2 a b cs = a : b : cs
  107. cons3 a b c ds = a : b : c : ds
  108. -- | A helper function for defining parsers for arbitrary-base integers.
  109. -- The first argument will be the base, and the second will be the
  110. -- parser for the individual digits.
  111. number :: Integer -> Parser Char -> Parser Integer
  112. number base digits = foldl go 0 <$> many1 digits
  113. where go x d = base * x + toInteger (value d)
  114. value c
  115. | c == 'a' || c == 'A' = 0xa
  116. | c == 'b' || c == 'B' = 0xb
  117. | c == 'c' || c == 'C' = 0xc
  118. | c == 'd' || c == 'D' = 0xd
  119. | c == 'e' || c == 'E' = 0xe
  120. | c == 'f' || c == 'F' = 0xf
  121. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  122. | c == '\x218a' = 0xa
  123. | c == '\x218b' = 0xb
  124. | otherwise = error ("Unknown letter in number: " ++ show c)
  125. sign :: Num a => Parser (a -> a)
  126. sign = (pure id <* char '+')
  127. <|> (pure negate <* char '-')
  128. <|> pure id
  129. -- | Given a parser for some kind of numeric literal, this will attempt to
  130. -- parse a leading @+@ or a leading @-@ followed by the numeric literal,
  131. -- and if a @-@ is found, negate that literal.
  132. signed :: Num a => Parser a -> Parser a
  133. signed p = ($) <$> sign <*> p
  134. -- | A parser for non-signed binary numbers
  135. binNumber :: Parser Integer
  136. binNumber = number 2 (char '0' <|> char '1')
  137. -- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
  138. signedBinNumber :: Parser Integer
  139. signedBinNumber = ($) <$> sign <*> binNumber
  140. -- | A parser for non-signed octal numbers
  141. octNumber :: Parser Integer
  142. octNumber = number 8 (oneOf "01234567")
  143. -- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
  144. signedOctNumber :: Parser Integer
  145. signedOctNumber = ($) <$> sign <*> octNumber
  146. -- | A parser for non-signed decimal numbers
  147. decNumber :: Parser Integer
  148. decNumber = number 10 digit
  149. -- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
  150. signedDecNumber :: Parser Integer
  151. signedDecNumber = ($) <$> sign <*> decNumber
  152. dozDigit = digit <|> oneOf "AaBb\x218a\x218b"
  153. -- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
  154. -- the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
  155. -- and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
  156. -- respectively.
  157. dozNumber :: Parser Integer
  158. dozNumber = number 16 dozDigit
  159. -- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
  160. signedDozNumber :: Parser Integer
  161. signedDozNumber = ($) <$> sign <*> dozNumber
  162. -- | A parser for non-signed hexadecimal numbers
  163. hexNumber :: Parser Integer
  164. hexNumber = number 16 hexDigit
  165. -- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
  166. signedHexNumber :: Parser Integer
  167. signedHexNumber = ($) <$> sign <*> hexNumber