123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- module Data.SCargot.Common ( number
- , decNumber
- , hexNumber
- , octNumber
- , sign
- -- * Lisp Identifier Syntaxes
- , parseR5RSIdent
- , parseR6RSIdent
- , parseR7RSIdent
- ) where
- import Data.Char
- import Data.Text (Text)
- import qualified Data.Text as T
- import Text.Parsec
- import Text.Parsec.Char (satisfy)
- import Text.Parsec.Text (Parser)
- -- | Parse an identifier according to the R5RS Scheme standard. This
- -- will not normalize case, even though the R5RS standard specifies
- -- that all identifiers be normalized to lower case first.
- --
- -- An R5RS identifier is, broadly speaking, alphabetic or numeric
- -- and may include various symbols, but no escapes.
- parseR5RSIdent :: Parser Text
- parseR5RSIdent =
- T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
- where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
- subsequent = initial <|> digit <|> oneOf "+-.@"
- peculiar = string "+" <|> string "-" <|> string "..."
- hasCategory :: Char -> [GeneralCategory] -> Bool
- hasCategory c cs = generalCategory c `elem` cs
- -- | Parse an identifier according to the R6RS Scheme standard. An
- -- R6RS identifier may include inline hexadecimal escape sequences
- -- so that, for example, 'foo' is equivalent to 'f\x6f;o', and is
- -- more liberal than R5RS as to which Unicode characters it may
- -- accept.
- parseR6RSIdent :: Parser Text
- parseR6RSIdent =
- T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
- where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
- constituent = letter
- <|> uniClass (\ c -> isLetter c ||
- isSymbol c ||
- hasCategory c
- [ NonSpacingMark
- , LetterNumber
- , OtherNumber
- , DashPunctuation
- , ConnectorPunctuation
- , OtherPunctuation
- , PrivateUse
- ])
- inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
- subsequent = initial <|> digit <|> oneOf "+-.@"
- <|> uniClass (\ c -> hasCategory c
- [ DecimalNumber
- , SpacingCombiningMark
- , EnclosingMark
- ])
- peculiar = string "+" <|> string "-" <|> string "..." <|>
- ((++) <$> string "->" <*> many subsequent)
- uniClass :: (Char -> Bool) -> Parser Char
- uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
- -- | Parse an identifier according to the R7RS Scheme standard. An
- -- R7RS identifier, in addition to a typical identifier format,
- -- can also be a chunk of text surrounded by vertical bars that
- -- can contain spaces and other characters. Unlike R6RS, it does
- -- not allow escapes to be included in identifiers that are not
- -- surrounded by vertical bars.
- parseR7RSIdent :: Parser Text
- parseR7RSIdent = T.pack <$>
- ( (:) <$> initial <*> many subsequent
- <|> char '|' *> many1 symbolElement <* char '|'
- <|> peculiar
- )
- where initial = letter <|> specInit
- specInit = oneOf "!$%&*/:<=>?^_~"
- subsequent = initial <|> digit <|> specSubsequent
- specSubsequent = expSign <|> oneOf ".@"
- expSign = oneOf "+-"
- symbolElement = noneOf "\\|"
- <|> hexEscape
- <|> mnemEscape
- <|> ('|' <$ string "\\|")
- hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
- mnemEscape = '\a' <$ string "\\a"
- <|> '\b' <$ string "\\b"
- <|> '\t' <$ string "\\t"
- <|> '\n' <$ string "\\n"
- <|> '\r' <$ string "\\r"
- peculiar = (:[]) <$> expSign
- <|> cons2 <$> expSign <*> signSub <*> many subsequent
- <|> cons3 <$> expSign
- <*> char '.'
- <*> dotSub
- <*> many subsequent
- <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
- dotSub = signSub <|> char '.'
- signSub = initial <|> expSign <|> char '@'
- cons2 a b cs = a : b : cs
- cons3 a b c ds = a : b : c : ds
- -- | A helper function for defining parsers for arbitrary-base integers.
- -- The first argument will be the base, and the second will be the
- -- parser for the individual digits.
- number :: Integer -> Parser Char -> Parser Integer
- number base digits = foldl go 0 <$> many1 digits
- where go x d = base * x + toInteger (value d)
- value c
- | c == 'a' || c == 'A' = 0xa
- | c == 'b' || c == 'B' = 0xb
- | c == 'c' || c == 'C' = 0xc
- | c == 'd' || c == 'D' = 0xd
- | c == 'e' || c == 'E' = 0xe
- | c == 'f' || c == 'F' = 0xf
- | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
- | otherwise = error ("Unknown letter in number: " ++ show c)
- -- | A parser for bare binary numbers
- binNumber :: Parser Integer
- binNumber = number 2 (char '0' <|> char '1')
- -- | A parser for bare octal numbers
- octNumber :: Parser Integer
- octNumber = number 8 digit
- -- | A parser for bare decimal numbers
- decNumber :: Parser Integer
- decNumber = number 10 digit
- -- | A parser for bare hexadecimal numbers
- hexNumber :: Parser Integer
- hexNumber = number 16 hexDigit
- -- | A parser for numeric signs, represented as a function from numbers
- -- to numbers. It will parse '+' as the identity function, '-', as
- -- 'negate', or consume no input and return the identity function.
- -- This can be combined with other numeric literals to implement
- -- signedness:
- --
- -- > myNum = go <$> sign <*> decNumber
- -- > where go s n = s n
- sign :: Num a => Parser (a -> a)
- sign = (pure id <* char '+')
- <|> (pure negate <* char '-')
- <|> pure id
|