{-# LANGUAGE
DeriveGeneric, FlexibleInstances, MultiParamTypeClasses,
OverloadedStrings, TypeFamilies, TypeOperators
#-}
module GardGround.Utils.SteParser.Lex (
Text,
Ident(..),
Parser',
ParserEnv(..),
Span(..),
HandleTree(..),
emptyIdent,
nullIdent,
-- creating and running parsers
makeParseEnv,
parseFile,
-- parser combinators
takeUntil,
takeWhile,
takeIdent,
takeNatural,
takeWithProperty,
skipWhiteSpace,
tryOne,
eats,
recordSpan,
) where
import Prelude hiding (takeWhile, span, splitAt)
import Control.Applicative (Alternative(..), (<|>))
import Control.DeepSeq (NFData)
import Control.Monad (guard)
import Control.Monad.State.Strict
import Data.Char (isDigit, isHexDigit, digitToInt)
import Data.Hashable (Hashable)
import qualified Data.ByteString as Bb
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U
import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN
import GardGround.Utils.SteParser
import Generic.Data
import Numeric.Natural (Natural)
-- | A 0-indexed, half-open interval of integers, defined by start & end indices
data Span = Span
{ start :: {-# UNPACK #-} !Int
, end :: {-# UNPACK #-} !Int
}
deriving (Eq, Generic, Ord, Show)
instance Hashable Span
instance NFData Span
instance Semigroup Span where
Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)
type Text = U.UTF8 B.ByteString
data Ident = Ident !Span !Text
-- Ident equality ignores spans
instance Eq Ident where
Ident _ t1 == Ident _ t2 = (t1 == t2)
emptyIdent :: Int -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
nullIdent :: Ident -> Bool
nullIdent (Ident _ t) = Bb.null $ U.toRep t
instance Show Ident where
show (Ident _ t) = B.toString $ U.toRep t
data ParserEnv = ParserEnv {
peOffset :: !Int,
peText :: Text
}
type Parser' e a = Parser ParserEnv e a
makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
-- some simple parser combinators
shiftEnv :: (Int, Text) -> Parser' e ()
shiftEnv (ll, r) = Parser $ modify go
where
go :: ParserEnv -> ParserEnv
go st = st { peOffset = (peOffset st) + ll, peText = r }
slen :: Text -> Int
slen = C.length . U.toRep
takeUntil :: (Char -> Bool) -> Parser' e Text
takeUntil f = takeWhile (not . f)
takeWhile :: (Char -> Bool) -> Parser' e Text
takeWhile f = do
env <- get
let (l, r) = U.span f (peText env)
shiftEnv (slen l, r)
return l
takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do
env <- get
let start_ = peOffset env
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> okNone
Just (fi, r2) -> (
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
`seq`
(if IC.property IC.XidStart fi then (
do
shiftEnv (slen l, r)
rest <- takeWithProperty IC.XidContinue
end_ <- gets peOffset
let thlw = theHorrorsLieWithin fi rest
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
return (fmap (Just . mkident) thlw)
) else okNone))
where
okNone :: Parser' e (Either e2 (Maybe dt))
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
-- I hate this...
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
theHorrorsLieWithin fi rest =
-- make a byte string from the parts of the identifyer
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
-- finish a UTF-8 decoded identifier
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
-- handle decoding
fmap fini $ TE.decodeUtf8' iabs
skipWhiteSpace :: Parser' e ()
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
-- | try to parse the first character as a token
tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
tryOne f = do
env <- get
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> return Nothing
Just (fi, r2) ->
(if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
`seq`
case f fi of
Nothing -> return Nothing
Just x -> do
shiftEnv (slen l, r)
return . Just $ x
-- | A case tree for `eats`
data HandleTree x =
HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
| Htleaf !x
-- | try to recognize specific tokens
eats :: HandleTree x -> Parser' e (Maybe x)
eats ht = Parser . StateT $ \st ->
-- do backtracking in case of failure
let Parser (StateT eatsfun) = eats_ ht in
-- fmap @ Either
fmap (\val -> case val of
(Nothing, _) -> (Nothing, st)
(Just x, st2) -> (Just x, st2)
) $ eatsfun st
where
eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
eats_ :: HandleTree x -> Parser' e (Maybe x)
eats_ (Htleaf x) = pure (Just x)
eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
Nothing -> pure Nothing
Just y -> eats_ y
-- | record the span of the inner parser
recordSpan :: Parser' e a -> Parser' e (Span, a)
recordSpan inner = do
start_ <- gets peOffset
x <- inner
end_ <- gets peOffset
pure (Span { start = start_, end = end_ }, x)
takeNatural :: (s ~ ParserEnv, ParserError s e) => Parser' e (Maybe (Span, Natural))
takeNatural = (fmap (\(sp, i) -> fmap (\j -> (sp, j)) i)) . recordSpan $ do
fi' <- tryOne $ chkThenNat isDigit
case fi' of
Just 0 ->
fmap Just $ handleHex <|> do
se' <- tryOne $ chkThenNat isDigit
-- invalid number format
guard (se' == Nothing)
return 0
Just d -> fmap Just $ handleDigits 10 isDigit d
Nothing -> return Nothing
where
charToNat :: Char -> Natural
charToNat = fromInteger . toInteger . digitToInt
chkThenNat :: (Char -> Bool) -> Char -> Maybe Natural
chkThenNat chk c = if chk c then Just $ charToNat c else Nothing
-- this parser can't fail
handleDigits :: Natural -> (Char -> Bool) -> Natural -> Parser' e Natural
handleDigits mult chk acc' = go acc'
where
go acc = do
hd' <- tryOne $ chkThenNat chk
case hd' of
Just hd -> go (mult * acc + hd)
Nothing -> pure acc
handleHex = (tryOne $ \c -> if c == 'x' then Just () else Nothing) >>= (\x ->
(guard (x == Just ())) >> (handleDigits 16 isHexDigit 0)
)