module Data.SCargot.Common ( number
                           , decNumber
                           , hexNumber
                           , octNumber
                           , sign
                           -- * Lisp Identifier Syntaxes
                           , parseR5RSIdent
                           , parseR6RSIdent
                           , parseR7RSIdent
                           ) where

import           Data.Char
import           Data.Text (Text)
import qualified Data.Text as T
import           Text.Parsec
import           Text.Parsec.Char (satisfy)
import           Text.Parsec.Text (Parser)

-- | Parse an identifier according to the R5RS Scheme standard. This
--   will not normalize case, even though the R5RS standard specifies
--   that all identifiers be normalized to lower case first.
--
--   An R5RS identifier is, broadly speaking, alphabetic or numeric
--   and may include various symbols, but no escapes.
parseR5RSIdent :: Parser Text
parseR5RSIdent =
  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  where initial    = letter <|> oneOf "!$%&*/:<=>?^_~"
        subsequent = initial <|> digit <|> oneOf "+-.@"
        peculiar   = string "+" <|> string "-" <|> string "..."

hasCategory :: Char -> [GeneralCategory] -> Bool
hasCategory c cs = generalCategory c `elem` cs

-- | Parse an identifier according to the R6RS Scheme standard. An
--   R6RS identifier may include inline hexadecimal escape sequences
--   so that, for example, 'foo' is equivalent to 'f\x6f;o', and is
--   more liberal than R5RS as to which Unicode characters it may
--   accept.
parseR6RSIdent :: Parser Text
parseR6RSIdent =
  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
        constituent = letter
                   <|> uniClass (\ c -> isLetter c ||
                                        isSymbol c ||
                                        hasCategory c
                                          [ NonSpacingMark
                                          , LetterNumber
                                          , OtherNumber
                                          , DashPunctuation
                                          , ConnectorPunctuation
                                          , OtherPunctuation
                                          , PrivateUse
                                          ])
        inlineHex   = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
        subsequent  = initial <|> digit <|> oneOf "+-.@"
                   <|> uniClass (\ c -> hasCategory c
                                          [ DecimalNumber
                                          , SpacingCombiningMark
                                          , EnclosingMark
                                          ])
        peculiar    = string "+" <|> string "-" <|> string "..." <|>
                      ((++) <$> string "->" <*> many subsequent)
        uniClass :: (Char -> Bool) -> Parser Char
        uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)

-- | Parse an identifier according to the R7RS Scheme standard. An
--   R7RS identifier, in addition to a typical identifier format,
--   can also be a chunk of text surrounded by vertical bars that
--   can contain spaces and other characters. Unlike R6RS, it does
--   not allow escapes to be included in identifiers that are not
--   surrounded by vertical bars.
parseR7RSIdent :: Parser Text
parseR7RSIdent =  T.pack <$>
          (  (:) <$> initial <*> many subsequent
         <|> char '|' *> many1 symbolElement <* char '|'
         <|> peculiar
          )
  where initial = letter <|> specInit
        specInit = oneOf "!$%&*/:<=>?^_~"
        subsequent = initial <|> digit <|> specSubsequent
        specSubsequent = expSign <|> oneOf ".@"
        expSign = oneOf "+-"
        symbolElement =  noneOf "\\|"
                     <|> hexEscape
                     <|> mnemEscape
                     <|> ('|' <$ string "\\|")
        hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
        mnemEscape =  '\a' <$ string "\\a"
                  <|> '\b' <$ string "\\b"
                  <|> '\t' <$ string "\\t"
                  <|> '\n' <$ string "\\n"
                  <|> '\r' <$ string "\\r"
        peculiar =  (:[]) <$> expSign
                <|> cons2 <$> expSign <*> signSub <*> many subsequent
                <|> cons3 <$> expSign
                          <*> char '.'
                          <*> dotSub
                          <*> many subsequent
                <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
        dotSub = signSub <|> char '.'
        signSub = initial <|> expSign <|> char '@'
        cons2 a b cs   = a : b : cs
        cons3 a b c ds = a : b : c : ds

-- | A helper function for defining parsers for arbitrary-base integers.
--   The first argument will be the base, and the second will be the
--   parser for the individual digits.
number :: Integer -> Parser Char -> Parser Integer
number base digits = foldl go 0 <$> many1 digits
  where go x d = base * x + toInteger (value d)
        value c
          | c == 'a' || c == 'A' = 0xa
          | c == 'b' || c == 'B' = 0xb
          | c == 'c' || c == 'C' = 0xc
          | c == 'd' || c == 'D' = 0xd
          | c == 'e' || c == 'E' = 0xe
          | c == 'f' || c == 'F' = 0xf
          | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
          | otherwise = error ("Unknown letter in number: " ++ show c)

-- | A parser for bare binary numbers
binNumber :: Parser Integer
binNumber = number 2 (char '0' <|> char '1')

-- | A parser for bare octal numbers
octNumber :: Parser Integer
octNumber = number 8 digit

-- | A parser for bare decimal numbers
decNumber :: Parser Integer
decNumber = number 10 digit

-- | A parser for bare hexadecimal numbers
hexNumber :: Parser Integer
hexNumber = number 16 hexDigit

-- | A parser for numeric signs, represented as a function from numbers
--   to numbers. It will parse '+' as the identity function, '-', as
--   'negate', or consume no input and return the identity function.
--   This can be combined with other numeric literals to implement
--   signedness:
--
--   > myNum = go <$> sign <*> decNumber
--   >   where go s n = s n
sign :: Num a => Parser (a -> a)
sign =  (pure id     <* char '+')
    <|> (pure negate <* char '-')
    <|> pure id