Parcourir la source

Added a few new identifier syntaxes

Getty Ritter il y a 8 ans
Parent
commit
2520c10063
1 fichiers modifiés avec 93 ajouts et 1 suppressions
  1. 93 1
      Data/SCargot/Common.hs

+ 93 - 1
Data/SCargot/Common.hs

@@ -1,8 +1,13 @@
 module Data.SCargot.Common ( -- $intro
-                           -- * Lisp Identifier Syntaxes
+                           -- * Identifier Syntaxes
                              parseR5RSIdent
                            , parseR6RSIdent
                            , parseR7RSIdent
+                           , parseXIDIdentStrict
+                           , parseXIDIdentGeneral
+                           , parseHaskellIdent
+                           , parseHaskellVariable
+                           , parseHaskellConstructor
                              -- * Numeric Literal Parsers
                            , signed
                            , prefixedNumber
@@ -116,6 +121,93 @@ parseR7RSIdent =  T.pack <$>
         cons2 a b cs   = a : b : cs
         cons3 a b c ds = a : b : c : ds
 
+-- | Parse a Haskell variable identifier. The first character must be
+--   a lower-case letter; it may be followed by any number of
+--   lower-case letters, upper-case letters, digits, single quotes,
+--   or underscores. The matched characters are returned as 'Text'.
+parseHaskellVariable :: Parser Text
+parseHaskellVariable = do
+  firstChar <- lowerLetter
+  restChars <- many identTail
+  return (T.pack (firstChar : restChars))
+  where
+    lowerLetter = satisfy isLower
+    upperLetter = satisfy isUpper
+    decimal     = satisfy isDigit
+    -- Any character permitted after the leading one.
+    identTail   = lowerLetter
+              <|> upperLetter
+              <|> decimal
+              <|> char '\''
+              <|> char '_'
+
+-- | Parse a Haskell constructor identifier. The first character must
+--   be an upper-case letter; it may be followed by any number of
+--   lower-case letters, upper-case letters, digits, single quotes,
+--   or underscores. The matched characters are returned as 'Text'.
+parseHaskellConstructor :: Parser Text
+parseHaskellConstructor = do
+  firstChar <- upperLetter
+  restChars <- many identTail
+  return (T.pack (firstChar : restChars))
+  where
+    lowerLetter = satisfy isLower
+    upperLetter = satisfy isUpper
+    decimal     = satisfy isDigit
+    -- Any character permitted after the leading one.
+    identTail   = lowerLetter
+              <|> upperLetter
+              <|> decimal
+              <|> char '\''
+              <|> char '_'
+
+-- | Parse a Haskell identifier, matching both variable and
+--   constructor names. The first character must be an upper-case or
+--   lower-case letter; it may be followed by any number of
+--   lower-case letters, upper-case letters, digits, single quotes,
+--   or underscores. The matched characters are returned as 'Text'.
+parseHaskellIdent :: Parser Text
+parseHaskellIdent = do
+  firstChar <- upperLetter <|> lowerLetter
+  restChars <- many identTail
+  return (T.pack (firstChar : restChars))
+  where
+    lowerLetter = satisfy isLower
+    upperLetter = satisfy isUpper
+    decimal     = satisfy isDigit
+    -- Any character permitted after the leading one.
+    identTail   = lowerLetter
+              <|> upperLetter
+              <|> decimal
+              <|> char '\''
+              <|> char '_'
+
+-- Build a parser that accepts a single character @c@ for which
+-- @hasCategory c cats@ holds (presumably: the general category of
+-- @c@ is one of @cats@ -- confirm against 'hasCategory').
+hasCat :: [GeneralCategory] -> Parser Char
+hasCat allowed = satisfy inAllowed
+  where inAllowed ch = ch `hasCategory` allowed
+
+-- General categories accepted for the leading character of an
+-- XID-style identifier: the letter categories plus letter-numbers.
+-- Passed to 'hasCat' by the XID identifier parsers.
+xidStart :: [GeneralCategory]
+xidStart =
+  [ UppercaseLetter, LowercaseLetter, TitlecaseLetter
+  , ModifierLetter, OtherLetter, LetterNumber
+  ]
+
+-- General categories accepted for every character after the first
+-- in an XID-style identifier: everything in 'xidStart', followed by
+-- combining marks, decimal digits, and connector punctuation.
+xidContinue :: [GeneralCategory]
+xidContinue = xidStart ++ continueOnly
+  where
+    continueOnly =
+      [ NonSpacingMark, SpacingCombiningMark
+      , DecimalNumber, ConnectorPunctuation
+      ]
+
+-- | Parse a Unicode identifier of the form
+--   @<XID_Start> <XID_Continue>*@, which closely matches the
+--   identifiers found in most C-like languages. Note that the
+--   @XID_Start@ category does not include the underscore, so
+--   @__foo@ is not a valid XID identifier. To parse identifiers
+--   that may begin with an underscore, use 'parseXIDIdentGeneral'.
+parseXIDIdentStrict :: Parser Text
+parseXIDIdentStrict = do
+  leading  <- hasCat xidStart
+  trailing <- many (hasCat xidContinue)
+  return (T.pack (leading : trailing))
+
+-- | Parse a Unicode identifier of the form
+--   @(<XID_Start> | '_') <XID_Continue>*@, which closely matches
+--   the identifiers found in most C-like languages. Unlike
+--   'parseXIDIdentStrict', this also accepts an underscore as the
+--   leading character. That is closer to programming languages
+--   like C and Java, but deviates somewhat from the
+--   <http://unicode.org/reports/tr31/ Unicode Identifier and
+--   Pattern Syntax standard>.
+parseXIDIdentGeneral :: Parser Text
+parseXIDIdentGeneral = do
+  leading  <- hasCat xidStart <|> char '_'
+  trailing <- many (hasCat xidContinue)
+  return (T.pack (leading : trailing))
+
 -- | A helper function for defining parsers for arbitrary-base integers.
 --   The first argument will be the base, and the second will be the
 --   parser for the individual digits.