Common.hs 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. module Data.SCargot.Common ( number
  2. , decNumber
  3. , hexNumber
  4. , octNumber
  5. , sign
  6. -- * Lisp Identifier Syntaxes
  7. , parseR5RSIdent
  8. , parseR6RSIdent
  9. , parseR7RSIdent
  10. ) where
  11. import Data.Char
  12. import Data.Text (Text)
  13. import qualified Data.Text as T
  14. import Text.Parsec
  15. import Text.Parsec.Char (satisfy)
  16. import Text.Parsec.Text (Parser)
  17. -- | Parse an identifier according to the R5RS Scheme standard. This
  18. -- will not normalize case, even though the R5RS standard specifies
  19. -- that all identifiers be normalized to lower case first.
  20. --
  21. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  22. -- and may include various symbols, but no escapes.
  23. parseR5RSIdent :: Parser Text
  24. parseR5RSIdent =
  25. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  26. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  27. subsequent = initial <|> digit <|> oneOf "+-.@"
  28. peculiar = string "+" <|> string "-" <|> string "..."
  29. hasCategory :: Char -> [GeneralCategory] -> Bool
  30. hasCategory c cs = generalCategory c `elem` cs
  31. -- | Parse an identifier according to the R6RS Scheme standard. An
  32. -- R6RS identifier may include inline hexadecimal escape sequences
  33. -- so that, for example, 'foo' is equivalent to 'f\x6f;o', and is
  34. -- more liberal than R5RS as to which Unicode characters it may
  35. -- accept.
  36. parseR6RSIdent :: Parser Text
  37. parseR6RSIdent =
  38. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  39. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  40. constituent = letter
  41. <|> uniClass (\ c -> isLetter c ||
  42. isSymbol c ||
  43. hasCategory c
  44. [ NonSpacingMark
  45. , LetterNumber
  46. , OtherNumber
  47. , DashPunctuation
  48. , ConnectorPunctuation
  49. , OtherPunctuation
  50. , PrivateUse
  51. ])
  52. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  53. subsequent = initial <|> digit <|> oneOf "+-.@"
  54. <|> uniClass (\ c -> hasCategory c
  55. [ DecimalNumber
  56. , SpacingCombiningMark
  57. , EnclosingMark
  58. ])
  59. peculiar = string "+" <|> string "-" <|> string "..." <|>
  60. ((++) <$> string "->" <*> many subsequent)
  61. uniClass :: (Char -> Bool) -> Parser Char
  62. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  63. -- | Parse an identifier according to the R7RS Scheme standard. An
  64. -- R7RS identifier, in addition to a typical identifier format,
  65. -- can also be a chunk of text surrounded by vertical bars that
  66. -- can contain spaces and other characters. Unlike R6RS, it does
  67. -- not allow escapes to be included in identifiers that are not
  68. -- surrounded by vertical bars.
  69. parseR7RSIdent :: Parser Text
  70. parseR7RSIdent = T.pack <$>
  71. ( (:) <$> initial <*> many subsequent
  72. <|> char '|' *> many1 symbolElement <* char '|'
  73. <|> peculiar
  74. )
  75. where initial = letter <|> specInit
  76. specInit = oneOf "!$%&*/:<=>?^_~"
  77. subsequent = initial <|> digit <|> specSubsequent
  78. specSubsequent = expSign <|> oneOf ".@"
  79. expSign = oneOf "+-"
  80. symbolElement = noneOf "\\|"
  81. <|> hexEscape
  82. <|> mnemEscape
  83. <|> ('|' <$ string "\\|")
  84. hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
  85. mnemEscape = '\a' <$ string "\\a"
  86. <|> '\b' <$ string "\\b"
  87. <|> '\t' <$ string "\\t"
  88. <|> '\n' <$ string "\\n"
  89. <|> '\r' <$ string "\\r"
  90. peculiar = (:[]) <$> expSign
  91. <|> cons2 <$> expSign <*> signSub <*> many subsequent
  92. <|> cons3 <$> expSign
  93. <*> char '.'
  94. <*> dotSub
  95. <*> many subsequent
  96. <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
  97. dotSub = signSub <|> char '.'
  98. signSub = initial <|> expSign <|> char '@'
  99. cons2 a b cs = a : b : cs
  100. cons3 a b c ds = a : b : c : ds
  101. -- | A helper function for defining parsers for arbitrary-base integers.
  102. -- The first argument will be the base, and the second will be the
  103. -- parser for the individual digits.
  104. number :: Integer -> Parser Char -> Parser Integer
  105. number base digits = foldl go 0 <$> many1 digits
  106. where go x d = base * x + toInteger (value d)
  107. value c
  108. | c == 'a' || c == 'A' = 0xa
  109. | c == 'b' || c == 'B' = 0xb
  110. | c == 'c' || c == 'C' = 0xc
  111. | c == 'd' || c == 'D' = 0xd
  112. | c == 'e' || c == 'E' = 0xe
  113. | c == 'f' || c == 'F' = 0xf
  114. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  115. | otherwise = error ("Unknown letter in number: " ++ show c)
  116. -- | A parser for bare binary numbers
  117. binNumber :: Parser Integer
  118. binNumber = number 2 (char '0' <|> char '1')
  119. -- | A parser for bare octal numbers
  120. octNumber :: Parser Integer
  121. octNumber = number 8 digit
  122. -- | A parser for bare decimal numbers
  123. decNumber :: Parser Integer
  124. decNumber = number 10 digit
  125. -- | A parser for bare hexadecimal numbers
  126. hexNumber :: Parser Integer
  127. hexNumber = number 16 hexDigit
  128. -- | A parser for numeric signs, represented as a function from numbers
  129. -- to numbers. It will parse '+' as the identity function, '-', as
  130. -- 'negate', or consume no input and return the identity function.
  131. -- This can be combined with other numeric literals to implement
  132. -- signedness:
  133. --
  134. -- > myNum = go <$> sign <*> decNumber
  135. -- > where go s n = s n
  136. sign :: Num a => Parser (a -> a)
  137. sign = (pure id <* char '+')
  138. <|> (pure negate <* char '-')
  139. <|> pure id