| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369 | module Data.SCargot.Common ( -- $intro                           -- * Identifier Syntaxes                             parseR5RSIdent                           , parseR6RSIdent                           , parseR7RSIdent                           , parseXIDIdentStrict                           , parseXIDIdentGeneral                           , parseHaskellIdent                           , parseHaskellVariable                           , parseHaskellConstructor                             -- * Numeric Literal Parsers                           , signed                           , prefixedNumber                           , signedPrefixedNumber                           , binNumber                           , signedBinNumber                           , octNumber                           , signedOctNumber                           , decNumber                           , signedDecNumber                           , dozNumber                           , signedDozNumber                           , hexNumber                           , signedHexNumber                             -- ** Numeric Literals for Arbitrary Bases                           , commonLispNumberAnyBase                           , gnuM4NumberAnyBase                             -- ** Source locations                           , Location(..), Located(..), located, dLocation                           ) where#if !MIN_VERSION_base(4,8,0)import Control.Applicative hiding ((<|>), many)#endifimport           Control.Monad (guard)import           Data.Charimport           Data.Text (Text)import qualified Data.Text as Timport           Text.Parsecimport           Text.Parsec.Pos  (newPos)import           Text.Parsec.Text (Parser)-- | Parse an identifier according to the R5RS Scheme standard. This--   will not normalize case, even though the R5RS standard specifies--   that all identifiers be normalized to lower case first.----   An R5RS identifier is, broadly speaking, alphabetic or numeric--   and may include various symbols, but no escapes.parseR5RSIdent :: Parser TextparseR5RSIdent =  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)  where initial    = letter <|> oneOf "!$%&*/:<=>?^_~"        subsequent = initial <|> digit <|> oneOf "+-.@"        peculiar   = string "+" <|> string "-" <|> string "..."hasCategory :: Char -> [GeneralCategory] -> BoolhasCategory c cs = generalCategory c `elem` cs-- | Parse an identifier according to the R6RS Scheme standard. An--   R6RS identifier may include inline hexadecimal escape sequences--   so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is--   more liberal than R5RS as to which Unicode characters it may--   accept.parseR6RSIdent :: Parser TextparseR6RSIdent =  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)  where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex        constituent = letter                   <|> uniClass (\ c -> isLetter c ||                                        isSymbol c ||                                        hasCategory c                                          [ NonSpacingMark                                          , LetterNumber                                          , OtherNumber                                          , DashPunctuation                                          , ConnectorPunctuation                                          , OtherPunctuation                                          , PrivateUse                                          ])        inlineHex   = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')        subsequent  = initial <|> digit <|> oneOf "+-.@"                   <|> uniClass (\ c -> hasCategory c                                          [ DecimalNumber                                          , SpacingCombiningMark                                          , EnclosingMark                                          ])        peculiar    = string "+" <|> string "-" <|> string "..." <|>                      ((++) <$> string "->" <*> many subsequent)        uniClass :: (Char -> Bool) -> Parser Char        uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)-- | Parse an identifier according to the R7RS Scheme standard. An--   R7RS identifier, in addition to a typical identifier format,--   can also be a chunk of text surrounded by vertical bars that--   can contain spaces and other characters. Unlike R6RS, it does--   not allow escapes to be included in identifiers unless those--   identifiers are surrounded by vertical bars.parseR7RSIdent :: Parser TextparseR7RSIdent =  T.pack <$>          (  (:) <$> initial <*> many subsequent         <|> char '|' *> many1 symbolElement <* char '|'         <|> peculiar          )  where initial = letter <|> specInit        specInit = oneOf "!$%&*/:<=>?^_~"        subsequent = initial <|> digit <|> specSubsequent        specSubsequent = expSign <|> oneOf ".@"        expSign = oneOf "+-"        symbolElement =  noneOf "\\|"                     <|> hexEscape                     <|> mnemEscape                     <|> ('|' <$ string "\\|")        hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')        mnemEscape =  '\a' <$ string "\\a"                  <|> '\b' <$ string "\\b"                  <|> '\t' <$ string "\\t"                  <|> '\n' <$ string "\\n"                  <|> '\r' <$ string "\\r"        peculiar =  (:[]) <$> expSign                <|> cons2 <$> expSign <*> signSub <*> many subsequent                <|> cons3 <$> expSign                          <*> char '.'                          <*> dotSub                          <*> many subsequent                <|> cons2 <$> char '.' <*> dotSub <*> many subsequent        dotSub = signSub <|> char '.'        signSub = initial <|> expSign <|> char '@'        cons2 a b cs   = a : b : cs        cons3 a b c ds = a : b : c : ds-- | Parse a Haskell variable identifier: a sequence of alphanumeric--   characters, underscores, or single quote that begins with a--   lower-case letter.parseHaskellVariable :: Parser TextparseHaskellVariable =  T.pack <$> ((:) <$> small <*> many (small <|>                                      large <|>                                      digit' <|>                                      char '\'' <|>                                      char '_'))  where small = satisfy isLower        large = satisfy isUpper        digit' = satisfy isDigit-- | Parse a Haskell constructor: a sequence of alphanumeric--   characters, underscores, or single quote that begins with an--   upper-case letter.parseHaskellConstructor :: Parser TextparseHaskellConstructor =  T.pack <$> ((:) <$> large <*> many (small <|>                                      large <|>                                      digit' <|>                                      char '\'' <|>                                      char '_'))  where small = satisfy isLower        large = satisfy isUpper        digit' = satisfy isDigit-- | Parse a Haskell identifer: a sequence of alphanumeric--   characters, underscores, or a single quote. This matches both--   variable and constructor names.parseHaskellIdent :: Parser TextparseHaskellIdent =  T.pack <$> ((:) <$> (large <|> small)                  <*> many (small <|>                            large <|>                            digit' <|>                            char '\'' <|>                            char '_'))  where small = satisfy isLower        large = satisfy isUpper        digit' = satisfy isDigit-- Ensure that a given character has the given Unicode categoryhasCat :: [GeneralCategory] -> Parser CharhasCat cats = satisfy (flip hasCategory cats)xidStart :: [GeneralCategory]xidStart = [ UppercaseLetter           , LowercaseLetter           , TitlecaseLetter           , ModifierLetter           , OtherLetter           , LetterNumber           ]xidContinue :: [GeneralCategory]xidContinue = xidStart ++ [ NonSpacingMark                          , SpacingCombiningMark                          , DecimalNumber                          , ConnectorPunctuation                          ]-- | Parse an identifier of unicode characters of the form--   @<XID_Start> <XID_Continue>*@, which corresponds strongly--   to the identifiers found in most C-like languages. Note that--   the @XID_Start@ category does not include the underscore,--   so @__foo@ is not a valid XID identifier. To parse--   identifiers that may include leading underscores, use--   'parseXIDIdentGeneral'.parseXIDIdentStrict :: Parser TextparseXIDIdentStrict = T.pack <$> ((:) <$> hasCat xidStart                                  <*> many (hasCat xidContinue))-- | Parse an identifier of unicode characters of the form--   @(<XID_Start> | '_') <XID_Continue>*@, which corresponds--   strongly to the identifiers found in most C-like languages.--   Unlike 'parseXIDIdentStrict', this will also accept an--   underscore as leading character, which corresponds more--   closely to programming languages like C and Java, but--   deviates somewhat from the--   <http://unicode.org/reports/tr31/ Unicode Identifier and--   Pattern Syntax standard>.parseXIDIdentGeneral :: Parser TextparseXIDIdentGeneral = T.pack <$> ((:) <$> (hasCat xidStart <|> char '_')                                       <*> many (hasCat xidContinue))-- | A helper function for defining parsers for arbitrary-base integers.--   The first argument will be the base, and the second will be the--   parser for the individual digits.number :: Integer -> Parser Char -> Parser Integernumber base digits = foldl go 0 <$> many1 digits  where go x d = base * x + toInteger (value d)        value c          | c >= 'a' && c <= 'z' = 0xa + (fromEnum c - fromEnum 'a')          | c >= 'A' && c <= 'Z' = 0xa + (fromEnum c - fromEnum 'A')          | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'          | c == '\x218a' = 0xa          | c == '\x218b' = 0xb          | otherwise = error ("Unknown letter in number: " ++ show c)digitsFor :: Int -> [Char]digitsFor n  | n <= 10   = take n ['0'..'9']  | n <= 36   = take (n-10) ['A'..'Z'] ++ take (n-10) ['a'..'z'] ++ ['0'..'9']  | otherwise = error ("Invalid base for parser: " ++ show n)anyBase :: Integer -> Parser IntegeranyBase n = number n (oneOf (digitsFor (fromIntegral n)))-- | A parser for Common Lisp's arbitrary-base number syntax, of--   the form @#[base]r[number]@, where the base is given in--   decimal. Note that this syntax begins with a @#@, which--   means it might conflict with defined reader macros.commonLispNumberAnyBase :: Parser IntegercommonLispNumberAnyBase = do  _ <- char '#'  n <- decNumber  guard (n >= 2 && n <= 36)  _ <- char 'r'  signed (anyBase n)-- | A parser for GNU m4's arbitrary-base number syntax, of--   the form @0r[base]:[number]@, where the base is given in--   decimal.gnuM4NumberAnyBase :: Parser IntegergnuM4NumberAnyBase = do  _ <- string "0r"  n <- decNumber  guard (n >= 2 && n <= 36)  _ <- char ':'  signed (anyBase n)sign :: Num a => Parser (a -> a)sign =  (pure id     <* char '+')    <|> (pure negate <* char '-')    <|> pure id-- | Given a parser for some kind of numeric literal, this will attempt to--   parse a leading @+@ or a leading @-@ followed by the numeric literal,--   and if a @-@ is found, negate that literal.signed :: Num a => Parser a -> Parser asigned p = ($) <$> sign <*> p-- | Parses a number in the same way as 'prefixedNumber', with an optional--   leading @+@ or @-@.signedPrefixedNumber :: Parser IntegersignedPrefixedNumber = signed prefixedNumber-- | Parses a number, determining which numeric base to use by examining--   the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a--   dozenal number, @0o@ for an octal number, and @0b@ for a binary--   number (as well as the upper-case versions of the same.) If the--   base is omitted entirely, then it is treated as a decimal number.prefixedNumber :: Parser IntegerprefixedNumber =  (string "0x" <|> string "0X") *> hexNumber              <|> (string "0o" <|> string "0O") *> octNumber              <|> (string "0z" <|> string "0Z") *> dozNumber              <|> (string "0b" <|> string "0B") *> binNumber              <|> decNumber-- | A parser for non-signed binary numbersbinNumber :: Parser IntegerbinNumber = number 2 (char '0' <|> char '1')-- | A parser for signed binary numbers, with an optional leading @+@ or @-@.signedBinNumber :: Parser IntegersignedBinNumber = signed binNumber-- | A parser for non-signed octal numbersoctNumber :: Parser IntegeroctNumber = number 8 (oneOf "01234567")-- | A parser for signed octal numbers, with an optional leading @+@ or @-@.signedOctNumber :: Parser IntegersignedOctNumber = ($) <$> sign <*> octNumber-- | A parser for non-signed decimal numbersdecNumber :: Parser IntegerdecNumber = number 10 digit-- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.signedDecNumber :: Parser IntegersignedDecNumber = ($) <$> sign <*> decNumberdozDigit :: Parser ChardozDigit = digit <|> oneOf "AaBb\x218a\x218b"-- | A parser for non-signed duodecimal (dozenal) numbers. This understands both--   the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)--   and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@--   respectively.dozNumber :: Parser IntegerdozNumber = number 12 dozDigit-- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.signedDozNumber :: Parser IntegersignedDozNumber = ($) <$> sign <*> dozNumber-- | A parser for non-signed hexadecimal numbershexNumber :: Parser IntegerhexNumber = number 16 hexDigit-- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.signedHexNumber :: Parser IntegersignedHexNumber = ($) <$> sign <*> hexNumber-- |data Location = Span !SourcePos !SourcePos  deriving (Eq, Ord, Show)-- | Add support for source locations while parsing S-expressions, as described in this--   <https://www.reddit.com/r/haskell/comments/4x22f9/labelling_ast_nodes_with_locations/d6cmdy9/ Reddit>-- thread.data Located a = At !Location a  deriving (Eq, Ord, Show)-- | Adds a source span to a parser.located :: Parser a -> Parser (Located a)located parser = do  begin <- getPosition  result <- parser  end <- getPosition  return $ At (Span begin end) result-- | A default location valuedLocation :: LocationdLocation = Span dPos dPos  where dPos = newPos "" 0 0{- $introThis module contains a selection of parsers for different kinds ofidentifiers and literals, from which more elaborate parsers can beassembled. These can afford the user a quick way of building parsersfor different atom types.-}
 |