fogti/gardground: core/lib/GardGround/Utils/SteParser/Lex.hs

{-# LANGUAGE
        DeriveGeneric, FlexibleInstances, MultiParamTypeClasses,
        OverloadedStrings, TypeFamilies, TypeOperators
  #-}

module GardGround.Utils.SteParser.Lex (
    Text,
    Ident(..),
    Parser',
    ParserEnv(..),
    Span(..),
    HandleTree(..),
    emptyIdent,
    nullIdent,

    -- creating and running parsers
    makeParseEnv,
    parseFile,

    -- parser combinators
    takeUntil,
    takeWhile,
    takeIdent,
    takeNatural,
    takeWithProperty,
    skipWhiteSpace,
    tryOne,
    eats,
    recordSpan,

    -- concrete parsers
    skipComments,
) where

import Prelude hiding (takeWhile, span, splitAt)
import Control.Applicative (Alternative(..), (<|>))
import Control.DeepSeq (NFData)
import Control.Monad (guard)
import Control.Monad.State.Strict
import Data.Char (isDigit, isHexDigit, digitToInt)
import Data.Hashable (Hashable)
import Data.Maybe (isJust)
import qualified Data.ByteString as Bb
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U
import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN
import GardGround.Utils.SteParser
import Generic.Data
import Numeric.Natural (Natural)

-- | A 0-indexed, half-open interval of integers, defined by start & end indices
data Span = Span
  { start :: {-# UNPACK #-} !Natural
  , end   :: {-# UNPACK #-} !Natural
  }
  deriving (Eq, Generic, Ord, Show)

instance Hashable Span
instance NFData   Span

instance Semigroup Span where
  Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)

type Text = U.UTF8 B.ByteString

data Ident = Ident !Span !Text

-- Ident equality ignores spans
instance Eq Ident where
  Ident _ t1 == Ident _ t2 = (t1 == t2)

emptyIdent :: Natural -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)

nullIdent :: Ident -> Bool
nullIdent (Ident _ t) = Bb.null $ U.toRep t

instance Show Ident where
    show (Ident _ t) = B.toString $ U.toRep t

data ParserEnv = ParserEnv {
        peOffset :: !Natural,
        peText   :: Text
    }

type Parser' e a = Parser ParserEnv e a

makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }

parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
  contents <- C.readFile f
  return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)

-- some simple parser combinators

shiftEnv :: (Natural, Text) -> Parser' e ()
shiftEnv (ll, r) = Parser $ modify go
  where
    go :: ParserEnv -> ParserEnv
    go st = st { peOffset = (peOffset st) + ll, peText = r }

slen :: Text -> Natural
slen = toEnum . C.length . U.toRep

takeUntil :: (Char -> Bool) -> Parser' e Text
takeUntil f = takeWhile (not . f)

takeWhile :: (Char -> Bool) -> Parser' e Text
takeWhile f = do
  env <- get
  let (l, r) = U.span f (peText env)
  shiftEnv (slen l, r)
  return l

takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p

takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do
  env <- get
  let start_ = peOffset env
  let (l, r) = U.splitAt 1 (peText env)
  case U.uncons l of
    Nothing -> okNone
    Just (fi, r2) -> (
      (if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
      `seq`
      (if IC.property IC.XidStart fi then (
        do
          shiftEnv (slen l, r)
          rest <- takeWithProperty IC.XidContinue
          end_ <- gets peOffset
          let thlw = theHorrorsLieWithin fi rest
          let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
          return (fmap (Just . mkident) thlw)
      ) else okNone))

  where
    okNone :: Parser' e (Either e2 (Maybe dt))
    okNone = Parser . StateT $ \st -> Right (Right Nothing, st)

    -- I hate this...
    theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
    theHorrorsLieWithin fi rest =
      -- make a byte string from the parts of the identifyer
      let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
      -- finish a UTF-8 decoded identifier
      let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
      -- handle decoding
      fmap fini $ TE.decodeUtf8' iabs

skipWhiteSpace :: Parser' e ()
skipWhiteSpace = fmap (const ()) $ takeWithProperty IC.WhiteSpace

-- | try to parse the first character as a token
tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
tryOne f = do
  env <- get
  let (l, r) = U.splitAt 1 (peText env)
  case U.uncons l of
    Nothing -> pure Nothing
    Just (fi, r2) ->
      (if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
      `seq`
        do
          let fres = f fi
          (if isJust fres then shiftEnv (slen l, r) else pure ())
          pure fres

-- | A case tree for `eats`
data HandleTree x =
    HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
  | Htleaf !x

-- | try to recognize specific tokens
eats :: HandleTree x -> Parser' e (Maybe x)
eats ht = Parser . StateT $ \st ->
    -- do backtracking in case of failure
    let Parser (StateT eatsfun) = eats_ ht in
    -- fmap @ Either
    fmap (\val -> case val of
      (Nothing, _) -> (Nothing, st)
      (Just x, st2) -> (Just x, st2)
    ) $ eatsfun st

  where
    eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
    eats_ :: HandleTree x -> Parser' e (Maybe x)
    eats_ (Htleaf x) = pure (Just x)
    eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
      Nothing -> pure Nothing
      Just y ->  eats_ y

-- | record the span of the inner parser
recordSpan :: Parser' e a -> Parser' e (Span, a)
recordSpan inner = do
  start_ <- gets peOffset
  x      <- inner
  end_   <- gets peOffset
  pure (Span { start = start_, end = end_ }, x)

takeNatural :: (s ~ ParserEnv, ParserError s e) => Parser' e (Maybe (Span, Natural))
takeNatural = (fmap (\(sp, i) -> fmap (\j -> (sp, j)) i)) . recordSpan $ do
  fi' <- tryOne $ chkThenNat isDigit
  case fi' of
    Just 0 ->
      fmap Just $ handleHex <|> do
        se' <- tryOne $ chkThenNat isDigit
        -- invalid number format
        guard (se' == Nothing)
        return 0

    Just d -> fmap Just $ handleDigits 10 isDigit d
    Nothing -> return Nothing

  where
    charToNat :: Char -> Natural
    charToNat = fromInteger . toInteger . digitToInt

    chkThenNat :: (Char -> Bool) -> Char -> Maybe Natural
    chkThenNat chk c = if chk c then Just $ charToNat c else Nothing

    -- this parser can't fail
    handleDigits :: Natural -> (Char -> Bool) -> Natural -> Parser' e Natural
    handleDigits mult chk acc' = go acc'
      where
        go acc = do
          hd' <- tryOne $ chkThenNat chk
          case hd' of
            Just hd -> go (mult * acc + hd)
            Nothing -> pure acc

    handleHex = (tryOne $ \c -> if c == 'x' then Just () else Nothing) >>= (\x ->
        (guard (x == Just ())) >> (handleDigits 16 isHexDigit 0)
      )

-- | usually called as `skipComments 0`, skips over comments delimited by `(*` and `*)`
skipComments :: Integer -> Parser' e ()
skipComments (-1) = error "lexeComments invalid index"

skipComments lvl = do
  (if lvl <= 0 then skipWhiteSpace else pure ())
  fi <- eats (if lvl <= 0 then lexeCommentsTreeInit else lexeCommentsTree)
  case fi of
    Just ld -> skipComments $ lvl + ld
    -- we either can abort searching for comment contents (lvl == 0)
    -- or we skip over a single character and continue
    Nothing -> if lvl <= 0 then pure () else tryOne (\_ -> Just ()) >> skipComments lvl

  where
    lexeCommentsTreeInit :: HandleTree Integer
    lexeCommentsTreeInit = HandleTree (H.fromList l1) Nothing
      where
        l1 = [('(' {- ) -}, HandleTree (H.fromList l2) Nothing)]
        l2 = [('*',         Htleaf 1)]

    lexeCommentsTree :: HandleTree Integer
    lexeCommentsTree = HandleTree (H.fromList l1) Nothing
      where
        -- (* ... comment start; *) ... comment end
        l1 = [('(' {- ) -}, HandleTree (H.fromList l2) Nothing)
             ,('*',         HandleTree (H.fromList l3) Nothing)]
        l2 = [('*',         Htleaf 1)]
        l3 = [({- ( -} ')', Htleaf (-1))]