Run the lexer on Text
glguy committed Feb 20, 2024
1 parent e7f4d3f commit 4c547ce
Showing 13 changed files with 74 additions and 59 deletions.
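At a glance, the user-facing effect of this commit is that the top-level entry points now take strict `Text` rather than `String`. A minimal sketch of a caller after this change, assuming `parse` and `Table` are re-exported from the top-level `Toml` module as the test drivers later in this diff suggest:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import qualified Toml

-- With OverloadedStrings an inline TOML document is typed as Text rather
-- than String, so literal-based call sites keep compiling unchanged.
smallDoc :: Either String Toml.Table
smallDoc = Toml.parse "x = 1\nname = 'toml-parser'\n"
```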
3 changes: 2 additions & 1 deletion README.md
@@ -42,6 +42,7 @@ to ensure that its code typechecks and stays in sync with the rest of the packag

```haskell
{-# Language OverloadedStrings #-}
import Data.Text (Text)
import GHC.Generics (Generic)
import QuoteStr (quoteStr)
import Test.Hspec (Spec, hspec, it, shouldBe)
@@ -59,7 +60,7 @@ main = hspec (parses >> decodes >> encodes)
Consider this sample TOML text from the TOML specification.

```haskell
fruitStr :: String
fruitStr :: Text
fruitStr = [quoteStr|
```

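Since the README example now gives `fruitStr` the type `Text`, the quasi-quoted TOML document is consumed directly by the `Text`-based API. A hedged sketch of the same pattern in isolation (the `QuasiQuotes` pragma and the document contents here are illustrative assumptions, not lines from the README):

```haskell
{-# LANGUAGE OverloadedStrings, QuasiQuotes #-}

import Data.Text (Text)
import QuoteStr (quoteStr)

-- With OverloadedStrings enabled (as in the README), the quoteStr literal
-- can be used at type Text instead of String.
sample :: Text
sample = [quoteStr|
    title = "sample"
    answer = 42|]
```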
7 changes: 4 additions & 3 deletions src/Toml.hs
@@ -38,6 +38,7 @@ module Toml (
DocClass(..),
) where

import Data.Text (Text)
import Toml.FromValue (FromValue (fromValue), Result(..))
import Toml.FromValue.Matcher (runMatcher)
import Toml.Located (Located(..))
@@ -49,7 +50,7 @@ import Toml.ToValue (ToTable (toTable))
import Toml.Value

-- | Parse a TOML formatted 'String' or report an error message.
parse' :: String -> Either String (Table' Position)
parse' :: Text -> Either String (Table' Position)
parse' str =
case parseRawToml str of
Left e -> Left (prettyLocated e)
@@ -59,11 +60,11 @@ parse' str =
Right tab -> Right tab

-- | Parse a TOML formatted 'String' or report an error message.
parse :: String -> Either String Table
parse :: Text -> Either String Table
parse = fmap forgetTableAnns . parse'

-- | Use the 'FromValue' instance to decode a value from a TOML string.
decode :: FromValue a => String -> Result String a
decode :: FromValue a => Text -> Result String a
decode str =
case parse' str of
Left e -> Failure [e]
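For orientation, a short sketch of the two parse entry points changed above: `parse'` keeps a `Position` annotation on the table, and `parse` drops it via `forgetTableAnns`. Whether `parse'` and `Table'` are re-exported from `Toml` is an assumption (the decoder driver below does import `Table'` from `Toml`); `Position` is taken from `Toml.Position` as in the other modules in this diff.

```haskell
import Data.Text (Text)
import Toml (parse, parse', Table, Table')
import Toml.Position (Position)

-- Positions attached to every table entry, useful for error reporting.
annotated :: Text -> Either String (Table' Position)
annotated = parse'

-- The same parse with the annotations forgotten.
plain :: Text -> Either String Table
plain = parse
```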
34 changes: 18 additions & 16 deletions src/Toml/Lexer.x
@@ -19,6 +19,8 @@ This module uses actions and lexical hooks defined in
-}
module Toml.Lexer (Context(..), scanToken, lexValue, Token(..)) where

import Data.Text (Text)
import Data.Text qualified as Text
import Toml.Lexer.Token
import Toml.Lexer.Utils
import Toml.Located
@@ -139,59 +141,59 @@ $wschar+;

<lstr> {
$literal_char+ { strFrag }
"'" { endStr . fmap (drop 1) }
"'" { endStr . fmap (Text.drop 1) }
}

<bstr> {
$unescaped+ { strFrag }
\" { endStr . fmap (drop 1) }
\" { endStr . fmap (Text.drop 1) }
}

<mllstr> {
@mll_content+ { strFrag }
"'" {1,2} { strFrag }
"'" {3,5} { endStr . fmap (drop 3) }
"'" {3,5} { endStr . fmap (Text.drop 3) }
}

<mlbstr> {
@mlb_escaped_nl;
($unescaped | @newline)+ { strFrag }
\" {1,2} { strFrag }
\" {3,5} { endStr . fmap (drop 3) }
\" {3,5} { endStr . fmap (Text.drop 3) }
}

<mlbstr, bstr> {
\\ U $hexdig{8} { unicodeEscape }
\\ U { failure "\\U requires exactly 8 hex digits"}
\\ u $hexdig{4} { unicodeEscape }
\\ u { failure "\\u requires exactly 4 hex digits"}
\\ n { strFrag . ("\n" <$) }
\\ t { strFrag . ("\t" <$) }
\\ r { strFrag . ("\r" <$) }
\\ f { strFrag . ("\f" <$) }
\\ b { strFrag . ("\b" <$) }
\\ \\ { strFrag . ("\\" <$) }
\\ \" { strFrag . ("\"" <$) }
\\ n { strFrag . (Text.singleton '\n' <$) }
\\ t { strFrag . (Text.singleton '\t' <$) }
\\ r { strFrag . (Text.singleton '\r' <$) }
\\ f { strFrag . (Text.singleton '\f' <$) }
\\ b { strFrag . (Text.singleton '\b' <$) }
\\ \\ { strFrag . (Text.singleton '\\' <$) }
\\ \" { strFrag . (Text.singleton '\"' <$) }
$control # [\t\r\n] { recommendEscape }
}

{

type AlexInput = Located String
type AlexInput = Located Text

alexGetByte :: AlexInput -> Maybe (Int, AlexInput)
alexGetByte = locatedUncons

-- | Get the next token from a located string. This function can be total
-- because one of the possible token outputs is an error token.
scanToken :: Context -> Located String -> Either (Located String) (Located Token, Located String)
scanToken :: Context -> Located Text -> Either (Located String) (Located Token, Located Text)
scanToken st str =
case alexScan str (stateInt st) of
AlexEOF -> eofToken st str
AlexError str' -> Left (mkError <$> str')
AlexError str' -> Left (mkError . Text.unpack <$> str')
AlexSkip str' _ -> scanToken st str'
AlexToken str' n action ->
case action (take n <$> str) st of
case action (Text.take n <$> str) st of
Resume st' -> scanToken st' str'
LexerError e -> Left e
EmitToken t -> Right (t, str')
@@ -206,7 +208,7 @@ stateInt LstrContext {} = lstr
stateInt MlLstrContext{} = mllstr
-- | Lex a single token in a value context. This is mostly useful for testing.
lexValue :: String -> Either String Token
lexValue :: Text -> Either String Token
lexValue str =
case scanToken ValueContext Located{ locPosition = startPos, locThing = str } of
Left e -> Left (locThing e)
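Because `lexValue` now takes `Text`, lexing a value literal directly is a one-liner with `OverloadedStrings`. A small hedged example (the token constructors named in the comment appear in the encoder driver later in this diff):

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Toml.Lexer (lexValue, Token(..))

-- Each input is lexed in ValueContext; successful results are tokens such
-- as TokInteger or TokTrue, failures carry an error message.
samples :: [Either String Token]
samples = map lexValue ["123", "true", "1979-05-27"]
```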
42 changes: 21 additions & 21 deletions src/Toml/Lexer/Utils.hs
@@ -58,7 +58,7 @@ import Toml.Located (Located(..))
import Toml.Position (move, Position)

-- | Type of actions associated with lexer patterns
type Action = Located String -> Context -> Outcome
type Action = Located Text -> Context -> Outcome

data Outcome
= Resume Context
@@ -70,10 +70,10 @@ data Context
= TopContext -- ^ top-level where @[[@ and @]]@ have special meaning
| TableContext -- ^ inline table - lex key names
| ValueContext -- ^ value lexer - lex number literals
| MlBstrContext Position [String] -- ^ multiline basic string: position of opening delimiter and list of fragments
| BstrContext Position [String] -- ^ basic string: position of opening delimiter and list of fragments
| MlLstrContext Position [String] -- ^ multiline literal string: position of opening delimiter and list of fragments
| LstrContext Position [String] -- ^ literal string: position of opening delimiter and list of fragments
| MlBstrContext Position [Text] -- ^ multiline basic string: position of opening delimiter and list of fragments
| BstrContext Position [Text] -- ^ basic string: position of opening delimiter and list of fragments
| MlLstrContext Position [Text] -- ^ multiline literal string: position of opening delimiter and list of fragments
| LstrContext Position [Text] -- ^ literal string: position of opening delimiter and list of fragments
deriving Show

-- | Add a literal fragment of a string to the current string state.
@@ -88,10 +88,10 @@ strFrag (Located _ s) = \case
-- | End the current string state and emit the string literal token.
endStr :: Action
endStr (Located _ x) = \case
BstrContext p acc -> EmitToken (Located p (TokString (Text.pack (concat (reverse (x : acc))))))
MlBstrContext p acc -> EmitToken (Located p (TokMlString (Text.pack (concat (reverse (x : acc))))))
LstrContext p acc -> EmitToken (Located p (TokString (Text.pack (concat (reverse (x : acc))))))
MlLstrContext p acc -> EmitToken (Located p (TokMlString (Text.pack (concat (reverse (x : acc))))))
BstrContext p acc -> EmitToken (Located p (TokString (Text.concat (reverse (x : acc)))))
MlBstrContext p acc -> EmitToken (Located p (TokMlString (Text.concat (reverse (x : acc)))))
LstrContext p acc -> EmitToken (Located p (TokString (Text.concat (reverse (x : acc)))))
MlLstrContext p acc -> EmitToken (Located p (TokMlString (Text.concat (reverse (x : acc)))))
_ -> error "endStr: panic"

-- | Start a basic string literal
@@ -113,27 +113,27 @@ startMlLstr (Located p _) _ = Resume (MlLstrContext p [])
-- | Resolve a unicode escape sequence and add it to the current string literal
unicodeEscape :: Action
unicodeEscape (Located p lexeme) ctx =
case readHex (drop 2 lexeme) of
case readHex (drop 2 (Text.unpack lexeme)) of
[(n,_)] | 0xd800 <= n, n < 0xe000 -> LexerError (Located p "non-scalar unicode escape")
| n >= 0x110000 -> LexerError (Located p "unicode escape too large")
| otherwise -> strFrag (Located p [chr n]) ctx
| otherwise -> strFrag (Located p (Text.singleton (chr n))) ctx
_ -> error "unicodeEscape: panic"

recommendEscape :: Action
recommendEscape (Located p x) _ =
LexerError (Located p (printf "control characters must be escaped, use: \\u%04X" (ord (head x))))
LexerError (Located p (printf "control characters must be escaped, use: \\u%04X" (ord (Text.head x))))

-- | Emit a token ignoring the current lexeme
token_ :: Token -> Action
token_ t x _ = EmitToken (t <$ x)

-- | Emit a token using the current lexeme
token :: (String -> Token) -> Action
token f x _ = EmitToken (f <$> x)
token f x _ = EmitToken (f . Text.unpack <$> x)

-- | Emit a token using the current lexeme
textToken :: (Text -> Token) -> Action
textToken f x _ = EmitToken (f . Text.pack <$> x)
textToken f x _ = EmitToken (f <$> x)

-- | Attempt to parse the current lexeme as a date-time token.
timeValue ::
@@ -143,19 +143,19 @@ timeValue ::
(a -> Token) {- ^ token constructor -} ->
Action
timeValue description patterns constructor (Located p str) _ =
case asum [parseTimeM False defaultTimeLocale pat str | pat <- patterns] of
case asum [parseTimeM False defaultTimeLocale pat (Text.unpack str) | pat <- patterns] of
Nothing -> LexerError (Located p ("malformed " ++ description))
Just t -> EmitToken (Located p (constructor t))

-- | Pop the first character off a located string if it's not empty.
-- The resulting 'Int' will either be the ASCII value of the character
-- or @1@ for non-ASCII Unicode values. To avoid a clash, @\x1@ is
-- remapped to @0@.
locatedUncons :: Located String -> Maybe (Int, Located String)
locatedUncons :: Located Text -> Maybe (Int, Located Text)
locatedUncons Located { locPosition = p, locThing = str } =
case str of
"" -> Nothing
x:xs
case Text.uncons str of
Nothing -> Nothing
Just (x, xs)
| rest `seq` False -> undefined
| x == '\1' -> Just (0, rest)
| isAscii x -> Just (ord x, rest)
@@ -164,12 +164,12 @@ locatedUncons Located { locPosition = p, locThing = str } =
rest = Located { locPosition = move x p, locThing = xs }

-- | Generate the correct terminating token given the current lexer state.
eofToken :: Context -> Located String -> Either (Located String) (Located Token, Located String)
eofToken :: Context -> Located Text -> Either (Located String) (Located Token, Located Text)
eofToken (MlBstrContext p _) _ = Left (Located p "unterminated multi-line basic string")
eofToken (BstrContext p _) _ = Left (Located p "unterminated basic string")
eofToken (MlLstrContext p _) _ = Left (Located p "unterminated multi-line literal string")
eofToken (LstrContext p _) _ = Left (Located p "unterminated literal string")
eofToken _ t = Right (TokEOF <$ t, t)
eofToken _ t = Right (TokEOF <$ t, t)

failure :: String -> Action
failure err t _ = LexerError (err <$ t)
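The string contexts above now accumulate `Text` fragments, and `endStr` joins them with a single `Text.concat` rather than packing a concatenated `String`. A minimal illustration of the reverse-accumulate idiom (the helper name is invented for the example):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text

-- Fragments are consed on in reverse order while lexing; the closing
-- fragment plus the accumulator are reversed and concatenated once.
joinFragments :: Text -> [Text] -> Text
joinFragments closing acc = Text.concat (reverse (closing : acc))
```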
2 changes: 1 addition & 1 deletion src/Toml/Parser.y
@@ -146,7 +146,7 @@ pop :: { () }
-- | Parse a list of tokens either returning the first unexpected
-- token or a list of the TOML statements in the file to be
-- processed by "Toml.Semantics".
parseRawToml :: String -> Either (Located String) [Expr Position]
parseRawToml :: Text -> Either (Located String) [Expr Position]
parseRawToml = runParser parseRawToml_ TopContext . Located startPos

}
6 changes: 3 additions & 3 deletions src/Toml/Parser/Utils.hs
@@ -48,13 +48,13 @@ import Toml.Position (Position)
-- continuation passing implementation of a state monad with errors
newtype Parser r a = P {
getP ::
[Context] -> Located String ->
([Context] -> Located String -> a -> Either (Located String) r) ->
[Context] -> Located Text ->
([Context] -> Located Text -> a -> Either (Located String) r) ->
Either (Located String) r
}

-- | Run the top-level parser
runParser :: Parser r r -> Context -> Located String -> Either (Located String) r
runParser :: Parser r r -> Context -> Located Text -> Either (Located String) r
runParser (P k) ctx str = k [ctx] str \_ _ r -> Right r

-- | Bind implementation used in the happy-generated parser
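The parser's threaded input is now a `Located Text`. As a hedged illustration of the continuation-passing shape above, written as if inside `Toml.Parser.Utils` (the library's actual name for this primitive is not shown in the hunk), a pure parser simply hands its context stack and remaining input to the continuation:

```haskell
-- A return/pure-style parser in the CPS encoding above: nothing is
-- consumed and no lexer context is pushed or popped.
unitP :: a -> Parser r a
unitP x = P (\ctxs input k -> k ctxs input x)
```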
10 changes: 6 additions & 4 deletions test-drivers/decoder/Main.hs
@@ -14,12 +14,14 @@ module Main (main) where

import Data.Aeson qualified as Aeson
import Data.ByteString.Lazy qualified as BS
import Data.Text qualified as Text
import Data.Text.IO qualified as Text
import Toml (Value(..), Value'(..), parse, Table'(..))
import Toml.Pretty (prettyValue)

main :: IO ()
main =
do txt <- getContents
do txt <- Text.getContents
case parse txt of
Left e -> fail e
Right t -> BS.putStr (Aeson.encode t)
@@ -31,10 +33,10 @@ instance Aeson.ToJSON (Toml.Value' a) where
toJSON v =
case v of
Table' _ t -> Aeson.toJSON t
Array' _ a -> Aeson.toJSON a
String' _ s -> simple "string" s
List' _ a -> Aeson.toJSON a
Text' _ s -> simple "string" (Text.unpack s)
Integer' _ _ -> simple "integer" (show (prettyValue v))
Float' _ _ -> simple "float" (show (prettyValue v))
Double' _ _ -> simple "float" (show (prettyValue v))
Bool' _ _ -> simple "bool" (show (prettyValue v))
TimeOfDay' _ _ -> simple "time-local" (show (prettyValue v))
ZonedTime' _ _ -> simple "datetime" (show (prettyValue v))
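The decoder driver above switches from `getContents` to `Data.Text.IO.getContents`. Callers reading TOML from a file would adapt the same way; a sketch (the file name is a placeholder, and nothing in this commit prescribes this wrapper):

```haskell
import qualified Data.Text.IO as Text
import Toml (parse)

main :: IO ()
main =
  do txt <- Text.readFile "example.toml"
     case parse txt of
       Left e  -> fail e
       Right _ -> putStrLn "parsed OK"
```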
16 changes: 9 additions & 7 deletions test-drivers/encoder/Main.hs
@@ -18,6 +18,8 @@ import Data.Aeson.Types qualified as Aeson
import Data.ByteString.Lazy qualified as BS
import Data.Foldable (toList)
import Data.Map qualified as Map
import Data.Text (Text)
import Data.Text qualified as Text
import System.Exit (exitFailure)
import Toml (prettyToml, Value(..), Value'(..), Table)
import Toml.Lexer (lexValue, Token(..))
@@ -34,26 +36,26 @@ instance a ~ () => Aeson.FromJSON (Toml.Value' a) where
parseJSON =
mconcat [
Aeson.withArray "array" \xs ->
Toml.Array <$> traverse Aeson.parseJSON (toList xs),
Toml.List <$> traverse Aeson.parseJSON (toList xs),
Aeson.withObject "value" \o ->
do ty <- o Aeson..: "type"
vl <- o Aeson..: "value"
decodeValue ty vl,
fmap (toValue :: Map.Map String Value -> Value) . Aeson.parseJSON
]

decodeValue :: String -> String -> Aeson.Parser Toml.Value
decodeValue "string" x = pure (Toml.String x)
decodeValue :: String -> Text -> Aeson.Parser Toml.Value
decodeValue "string" x = pure (Toml.Text x)
decodeValue "bool" (lexValue -> Right TokTrue ) = pure (Toml.Bool True)
decodeValue "bool" (lexValue -> Right TokFalse ) = pure (Toml.Bool False)
decodeValue "integer" (lexValue -> Right (TokInteger x)) = pure (Toml.Integer x)
decodeValue "time-local" (lexValue -> Right (TokLocalTime x)) = pure (Toml.TimeOfDay x)
decodeValue "datetime" (lexValue -> Right (TokOffsetDateTime x)) = pure (Toml.ZonedTime x)
decodeValue "datetime-local" (lexValue -> Right (TokLocalDateTime x)) = pure (Toml.LocalTime x)
decodeValue "date-local" (lexValue -> Right (TokLocalDate x)) = pure (Toml.Day x)
decodeValue "float" (lexValue -> Right (TokFloat x)) = pure (Toml.Float x)
decodeValue "float" (lexValue -> Right (TokFloat x)) = pure (Toml.Double x)
-- toml-tests are inconsistent about representing floating point numbers
decodeValue "float" (lexValue -> Right (TokInteger x)) = pure (Toml.Float (fromInteger x))
decodeValue "float" "+Inf" = pure (Toml.Float (1/0))
decodeValue "float" "-Inf" = pure (Toml.Float (-1/0))
decodeValue "float" (lexValue -> Right (TokInteger x)) = pure (Toml.Double (fromInteger x))
decodeValue "float" "+Inf" = pure (Toml.Double (1/0))
decodeValue "float" "-Inf" = pure (Toml.Double (-1/0))
decodeValue _ _ = empty
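The `decodeValue` cases above use view patterns: the incoming `Text` is run through `lexValue` and matched on the resulting token. A standalone sketch of the same trick (the predicate is illustrative only):

```haskell
{-# LANGUAGE ViewPatterns #-}

import Data.Text (Text)
import Toml.Lexer (lexValue, Token(..))

-- Match on the token lexValue produces, as decodeValue does above.
isBoolLiteral :: Text -> Bool
isBoolLiteral (lexValue -> Right TokTrue)  = True
isBoolLiteral (lexValue -> Right TokFalse) = True
isBoolLiteral _                            = False
```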
3 changes: 2 additions & 1 deletion test-drivers/highlighter/Main.hs
@@ -10,6 +10,7 @@ Decode TOML into JSON for use with <https://github.com/BurntSushi/toml-test>
-}
module Main (main) where

import Data.Text.IO qualified as Text
import Prettyprinter.Render.Terminal
import Toml.Parser (parseRawToml)
import Toml.Pretty (prettyTomlOrdered, DocClass(..), prettyLocated, prettySemanticError)
@@ -18,7 +19,7 @@ import Toml.Semantics.Ordered (extractTableOrder, projectKey)

main :: IO ()
main =
do txt <- getContents
do txt <- Text.getContents
case parseRawToml txt of
Left e -> fail (prettyLocated e)
Right exprs ->
3 changes: 3 additions & 0 deletions test-drivers/toml-test-drivers.cabal
@@ -39,6 +39,7 @@ executable TomlDecoder
build-depends:
aeson ^>= {2.1, 2.2},
bytestring ^>= {0.10, 0.11, 0.12},
text,

executable TomlEncoder
import: shared
@@ -48,6 +49,7 @@ executable TomlEncoder
aeson ^>= {2.1, 2.2},
bytestring ^>= {0.10, 0.11, 0.12},
containers ^>= {0.5, 0.6},
text,

executable TomlHighlighter
import: shared
@@ -56,3 +58,4 @@ executable TomlHighlighter
build-depends:
prettyprinter ^>= 1.7.1,
prettyprinter-ansi-terminal ^>= 1.1.3,
text,