Common.hs 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. module Data.SCargot.Common ( number
  2. , decNumber
  3. , hexNumber
  4. , octNumber
  5. , sign
  6. -- * Lisp Identifier Syntaxes
  7. , parseR5RSIdent
  8. , parseR6RSIdent
  9. , parseR7RSIdent
  10. ) where
  11. import Data.Char
  12. import Data.Text (Text)
  13. import qualified Data.Text as T
  14. import Text.Parsec
  15. import Text.Parsec.Char (satisfy)
  16. import Text.Parsec.Text (Parser)
  17. -- | Parse an identifier according to the R5RS Scheme standard. This
  18. -- will not normalize case, even though the R5RS standard specifies
  19. -- that all identifiers be normalized to lower case first.
  20. --
  21. -- An R5RS identifier is, broadly speaking, alphabetic or numeric
  22. -- and may include various symbols, but no escapes.
  23. parseR5RSIdent :: Parser Text
  24. parseR5RSIdent =
  25. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  26. where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
  27. subsequent = initial <|> digit <|> oneOf "+-.@"
  28. peculiar = string "+" <|> string "-" <|> string "..."
  29. hasCategory :: Char -> [GeneralCategory] -> Bool
  30. hasCategory c cs = generalCategory c `elem` cs
  31. -- | Parse an identifier according to the R6RS Scheme standard. An
  32. -- R6RS identifier may include inline hexadecimal escape sequences
  33. -- so that, for example, 'foo' is equivalent to 'f\x6f;o', and is
  34. -- more liberal than R5RS as to which Unicode characters it may
  35. -- accept.
  36. parseR6RSIdent :: Parser Text
  37. parseR6RSIdent =
  38. T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  39. where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
  40. constituent = letter
  41. <|> uniClass (\ c -> isLetter c ||
  42. isSymbol c ||
  43. hasCategory c
  44. [ NonSpacingMark
  45. , LetterNumber
  46. , OtherNumber
  47. , DashPunctuation
  48. , ConnectorPunctuation
  49. , OtherPunctuation
  50. , PrivateUse
  51. ])
  52. inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
  53. subsequent = initial <|> digit <|> oneOf "+-.@"
  54. <|> uniClass (\ c -> hasCategory c
  55. [ DecimalNumber
  56. , SpacingCombiningMark
  57. , EnclosingMark
  58. ])
  59. peculiar = string "+" <|> string "-" <|> string "..." <|>
  60. ((++) <$> string "->" <*> many subsequent)
  61. uniClass :: (Char -> Bool) -> Parser Char
  62. uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
  63. -- | Parse an identifier according to the R7RS Scheme standard. An
  64. -- R7RS identifier, in addition to a typical identifier format,
  65. -- can also be a chunk of text surrounded by vertical bars that
  66. -- can contain spaces and other characters. Unlike R6RS, it does
  67. -- not allow escapes to be included in identifiers that are not
  68. -- surrounded by vertical bars.
  69. parseR7RSIdent :: Parser Text
  70. parseR7RSIdent = T.pack <$>
  71. ( (:) <$> initial <*> many subsequent
  72. <|> char '|' *> many1 symbolElement <* char '|'
  73. <|> peculiar
  74. )
  75. where initial = letter <|> specInit
  76. specInit = oneOf "!$%&*/:<=>?^_~"
  77. subsequent = initial <|> digit <|> specSubsequent
  78. specSubsequent = expSign <|> oneOf ".@"
  79. expSign = oneOf "+-"
  80. symbolElement = undefined
  81. peculiar = undefined
  82. -- | A helper function for defining parsers for arbitrary-base integers.
  83. -- The first argument will be the base, and the second will be the
  84. -- parser for the individual digits.
  85. number :: Integer -> Parser Char -> Parser Integer
  86. number base digits = foldl go 0 <$> many1 digits
  87. where go x d = base * x + toInteger (value d)
  88. value c
  89. | c == 'a' || c == 'A' = 0xa
  90. | c == 'b' || c == 'B' = 0xb
  91. | c == 'c' || c == 'C' = 0xc
  92. | c == 'd' || c == 'D' = 0xd
  93. | c == 'e' || c == 'E' = 0xe
  94. | c == 'f' || c == 'F' = 0xf
  95. | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
  96. | otherwise = error ("Unknown letter in number: " ++ show c)
  97. -- | A parser for bare binary numbers
  98. binNumber :: Parser Integer
  99. binNumber = number 2 (char '0' <|> char '1')
  100. -- | A parser for bare octal numbers
  101. octNumber :: Parser Integer
  102. octNumber = number 8 digit
  103. -- | A parser for bare decimal numbers
  104. decNumber :: Parser Integer
  105. decNumber = number 10 digit
  106. -- | A parser for bare hexadecimal numbers
  107. hexNumber :: Parser Integer
  108. hexNumber = number 16 hexDigit
  109. -- | A parser for numeric signs, represented as a function from numbers
  110. -- to numbers. It will parse '+' as the identity function, '-', as
  111. -- 'negate', or consume no input and return the identity function.
  112. -- This can be combined with other numeric literals to implement
  113. -- signedness:
  114. --
  115. -- > myNum = go <$> sign <*> decNumber
  116. -- > where go s n = s n
  117. sign :: Num a => Parser (a -> a)
  118. sign = (pure id <* char '+')
  119. <|> (pure negate <* char '-')
  120. <|> pure id