+ {-# LANGUAGE
+ DeriveGeneric, FlexibleInstances, MultiParamTypeClasses,
+ OverloadedStrings, TypeFamilies, TypeOperators
+ #-}
+
+ module GardGround.Utils.SteParser.Lex (
+ Text,
+ Ident(..),
+ Parser',
+ ParserEnv(..),
+ Span(..),
+ HandleTree(..),
+ emptyIdent,
+ nullIdent,
+
+ -- creating and running parsers
+ makeParseEnv,
+ parseFile,
+
+ -- parser combinators
+ takeUntil,
+ takeWhile,
+ takeIdent,
+ takeNatural,
+ takeWithProperty,
+ skipWhiteSpace,
+ tryOne,
+ eats,
+ recordSpan,
+ ) where
+
+ import Prelude hiding (takeWhile, span, splitAt)
+ import Control.Applicative (Alternative(..), (<|>))
+ import Control.DeepSeq (NFData)
+ import Control.Monad (guard)
+ import Control.Monad.State.Strict
+ import Data.Char (isDigit, isHexDigit, digitToInt)
+ import Data.Hashable (Hashable)
+ import qualified Data.ByteString as Bb
+ import qualified Data.ByteString.Char8 as C
+ import qualified Data.ByteString.UTF8 as B
+ import qualified Data.HashMap.Strict as H
+ import qualified Data.String.UTF8 as U
+ import qualified Data.Text.Encoding as TE
+ import Data.Text.Encoding.Error (UnicodeException)
+ import qualified Data.Text.ICU.Char as IC
+ import qualified Data.Text.ICU.Normalize2 as IN
+ import GardGround.Utils.SteParser
+ import Generic.Data
+ import Numeric.Natural (Natural)
+
+ -- | A 0-indexed, half-open interval of integers, defined by start & end indices
+ data Span = Span
+ { start :: {-# UNPACK #-} !Int
+ , end :: {-# UNPACK #-} !Int
+ }
+ deriving (Eq, Generic, Ord, Show)
+
+ instance Hashable Span
+ instance NFData Span
+
+ instance Semigroup Span where
+ Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)
+
+ type Text = U.UTF8 B.ByteString
+
+ data Ident = Ident !Span !Text
+
+ -- Ident equality ignores spans
+ instance Eq Ident where
+ Ident _ t1 == Ident _ t2 = (t1 == t2)
+
+ emptyIdent :: Int -> Ident
+ emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
+
+ nullIdent :: Ident -> Bool
+ nullIdent (Ident _ t) = Bb.null $ U.toRep t
+
+ instance Show Ident where
+ show (Ident _ t) = B.toString $ U.toRep t
+
+ data ParserEnv = ParserEnv {
+ peOffset :: !Int,
+ peText :: Text
+ }
+
+ type Parser' e a = Parser ParserEnv e a
+
+ makeParseEnv :: B.ByteString -> ParserEnv
+ makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
+
+ parseFile :: String -> Parser' e a -> IO (Either e a)
+ parseFile f p = do
+ contents <- C.readFile f
+ return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
+
+ -- some simple parser combinators
+
+ shiftEnv :: (Int, Text) -> Parser' e ()
+ shiftEnv (ll, r) = Parser $ modify go
+ where
+ go :: ParserEnv -> ParserEnv
+ go st = st { peOffset = (peOffset st) + ll, peText = r }
+
+ slen :: Text -> Int
+ slen = C.length . U.toRep
+
+ takeUntil :: (Char -> Bool) -> Parser' e Text
+ takeUntil f = takeWhile (not . f)
+
+ takeWhile :: (Char -> Bool) -> Parser' e Text
+ takeWhile f = do
+ env <- get
+ let (l, r) = U.span f (peText env)
+ shiftEnv (slen l, r)
+ return l
+
+ takeWithProperty :: IC.Bool_ -> Parser' e Text
+ takeWithProperty p = takeWhile $ IC.property p
+
+ takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
+ takeIdent = do
+ env <- get
+ let start_ = peOffset env
+ let (l, r) = U.splitAt 1 (peText env)
+ case U.uncons l of
+ Nothing -> okNone
+ Just (fi, r2) -> (
+ (if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
+ `seq`
+ (if IC.property IC.XidStart fi then (
+ do
+ shiftEnv (slen l, r)
+ rest <- takeWithProperty IC.XidContinue
+ end_ <- gets peOffset
+ let thlw = theHorrorsLieWithin fi rest
+ let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
+ return (fmap (Just . mkident) thlw)
+ ) else okNone))
+
+ where
+ okNone :: Parser' e (Either e2 (Maybe dt))
+ okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
+
+ -- I hate this...
+ theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
+ theHorrorsLieWithin fi rest =
+ -- make a byte string from the parts of the identifyer
+ let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
+ -- finish a UTF-8 decoded identifier
+ let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
+ -- handle decoding
+ fmap fini $ TE.decodeUtf8' iabs
+
+ skipWhiteSpace :: Parser' e ()
+ skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
+
+ -- | try to parse the first character as a token
+ tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
+ tryOne f = do
+ env <- get
+ let (l, r) = U.splitAt 1 (peText env)
+ case U.uncons l of
+ Nothing -> return Nothing
+ Just (fi, r2) ->
+ (if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
+ `seq`
+ case f fi of
+ Nothing -> return Nothing
+ Just x -> do
+ shiftEnv (slen l, r)
+ return . Just $ x
+
+ -- | A case tree for `eats`
+ data HandleTree x =
+ HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
+ | Htleaf !x
+
+ -- | try to recognize specific tokens
+ eats :: HandleTree x -> Parser' e (Maybe x)
+ eats ht = Parser . StateT $ \st ->
+ -- do backtracking in case of failure
+ let Parser (StateT eatsfun) = eats_ ht in
+ -- fmap @ Either
+ fmap (\val -> case val of
+ (Nothing, _) -> (Nothing, st)
+ (Just x, st2) -> (Just x, st2)
+ ) $ eatsfun st
+
+ where
+ eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
+ eats_ :: HandleTree x -> Parser' e (Maybe x)
+ eats_ (Htleaf x) = pure (Just x)
+ eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
+ Nothing -> pure Nothing
+ Just y -> eats_ y
+
+ -- | record the span of the inner parser
+ recordSpan :: Parser' e a -> Parser' e (Span, a)
+ recordSpan inner = do
+ start_ <- gets peOffset
+ x <- inner
+ end_ <- gets peOffset
+ pure (Span { start = start_, end = end_ }, x)
+
+ takeNatural :: (s ~ ParserEnv, ParserError s e) => Parser' e (Maybe (Span, Natural))
+ takeNatural = (fmap (\(sp, i) -> fmap (\j -> (sp, j)) i)) . recordSpan $ do
+ fi' <- tryOne $ chkThenNat isDigit
+ case fi' of
+ Just 0 ->
+ fmap Just $ handleHex <|> do
+ se' <- tryOne $ chkThenNat isDigit
+ -- invalid number format
+ guard (se' == Nothing)
+ return 0
+
+ Just d -> fmap Just $ handleDigits 10 isDigit d
+ Nothing -> return Nothing
+
+ where
+ charToNat :: Char -> Natural
+ charToNat = fromInteger . toInteger . digitToInt
+
+ chkThenNat :: (Char -> Bool) -> Char -> Maybe Natural
+ chkThenNat chk c = if chk c then Just $ charToNat c else Nothing
+
+ -- this parser can't fail
+ handleDigits :: Natural -> (Char -> Bool) -> Natural -> Parser' e Natural
+ handleDigits mult chk acc' = go acc'
+ where
+ go acc = do
+ hd' <- tryOne $ chkThenNat chk
+ case hd' of
+ Just hd -> go (mult * acc + hd)
+ Nothing -> pure acc
+
+ handleHex = (tryOne $ \c -> if c == 'x' then Just () else Nothing) >>= (\x ->
+ (guard (x == Just ())) >> (handleDigits 16 isHexDigit 0)
+ )