Run the lexer on Text
glguy committed Feb 20, 2024
1 parent e7f4d3f commit 4c547ce
Showing 13 changed files with 74 additions and 59 deletions.
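At a glance, the user-facing effect of this commit is that the top-level entry points now take strict `Text` rather than `String`. A minimal sketch of a caller after this change, assuming `parse` and `Table` are re-exported from the top-level `Toml` module as the test drivers later in this diff suggest:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import qualified Toml

-- With OverloadedStrings an inline TOML document is typed as Text rather
-- than String, so literal-based call sites keep compiling unchanged.
smallDoc :: Either String Toml.Table
smallDoc = Toml.parse "x = 1\nname = 'toml-parser'\n"
```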
3 changes: 2 additions & 1 deletion README.md
@@ -42,6 +42,7 @@ to ensure that its code typechecks and stays in sync with the rest of the packag

```haskell
{-# Language OverloadedStrings #-}
import Data.Text (Text)
import GHC.Generics (Generic)
import QuoteStr (quoteStr)
import Test.Hspec (Spec, hspec, it, shouldBe)
@@ -59,7 +60,7 @@ main = hspec (parses >> decodes >> encodes)
Consider this sample TOML text from the TOML specification.

```haskell
fruitStr :: String
fruitStr :: Text
fruitStr = [quoteStr|
```

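Since the README example now gives `fruitStr` the type `Text`, the quasi-quoted TOML document is consumed directly by the `Text`-based API. A hedged sketch of the same pattern in isolation (the `QuasiQuotes` pragma and the document contents here are illustrative assumptions, not lines from the README):

```haskell
{-# LANGUAGE OverloadedStrings, QuasiQuotes #-}

import Data.Text (Text)
import QuoteStr (quoteStr)

-- With OverloadedStrings enabled (as in the README), the quoteStr literal
-- can be used at type Text instead of String.
sample :: Text
sample = [quoteStr|
    title = "sample"
    answer = 42|]
```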
7 changes: 4 additions & 3 deletions src/Toml.hs
@@ -38,6 +38,7 @@ module Toml (
DocClass(..),
) where

import Data.Text (Text)
import Toml.FromValue (FromValue (fromValue), Result(..))
import Toml.FromValue.Matcher (runMatcher)
import Toml.Located (Located(..))
@@ -49,7 +50,7 @@ import Toml.ToValue (ToTable (toTable))
import Toml.Value

-- | Parse a TOML formatted 'String' or report an error message.
parse' :: String -> Either String (Table' Position)
parse' :: Text -> Either String (Table' Position)
parse' str =
case parseRawToml str of
Left e -> Left (prettyLocated e)
@@ -59,11 +60,11 @@ parse' str =
Right tab -> Right tab

-- | Parse a TOML formatted 'String' or report an error message.
parse :: String -> Either String Table
parse :: Text -> Either String Table
parse = fmap forgetTableAnns . parse'

-- | Use the 'FromValue' instance to decode a value from a TOML string.
decode :: FromValue a => String -> Result String a
decode :: FromValue a => Text -> Result String a
decode str =
case parse' str of
Left e -> Failure [e]
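For orientation, a short sketch of the two parse entry points changed above: `parse'` keeps a `Position` annotation on the table, and `parse` drops it via `forgetTableAnns`. Whether `parse'` and `Table'` are re-exported from `Toml` is an assumption (the decoder driver below does import `Table'` from `Toml`); `Position` is taken from `Toml.Position` as in the other modules in this diff.

```haskell
import Data.Text (Text)
import Toml (parse, parse', Table, Table')
import Toml.Position (Position)

-- Positions attached to every table entry, useful for error reporting.
annotated :: Text -> Either String (Table' Position)
annotated = parse'

-- The same parse with the annotations forgotten.
plain :: Text -> Either String Table
plain = parse
```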
34 changes: 18 additions & 16 deletions src/Toml/Lexer.x
@@ -19,6 +19,8 @@ This module uses actions and lexical hooks defined in
-}
module Toml.Lexer (Context(..), scanToken, lexValue, Token(..)) where

import Data.Text (Text)
import Data.Text qualified as Text
import Toml.Lexer.Token
import Toml.Lexer.Utils
import Toml.Located
@@ -139,59 +141,59 @@ $wschar+;

<lstr> {
$literal_char+ { strFrag }
"'" { endStr . fmap (drop 1) }
"'" { endStr . fmap (Text.drop 1) }
}

<bstr> {
$unescaped+ { strFrag }
\" { endStr . fmap (drop 1) }
\" { endStr . fmap (Text.drop 1) }
}

<mllstr> {
@mll_content+ { strFrag }
"'" {1,2} { strFrag }
"'" {3,5} { endStr . fmap (drop 3) }
"'" {3,5} { endStr . fmap (Text.drop 3) }
}

<mlbstr> {
@mlb_escaped_nl;
($unescaped | @newline)+ { strFrag }
\" {1,2} { strFrag }
\" {3,5} { endStr . fmap (drop 3) }
\" {3,5} { endStr . fmap (Text.drop 3) }
}

<mlbstr, bstr> {
\\ U $hexdig{8} { unicodeEscape }
\\ U { failure "\\U requires exactly 8 hex digits"}
\\ u $hexdig{4} { unicodeEscape }
\\ u { failure "\\u requires exactly 4 hex digits"}
\\ n { strFrag . ("\n" <$) }
\\ t { strFrag . ("\t" <$) }
\\ r { strFrag . ("\r" <$) }
\\ f { strFrag . ("\f" <$) }
\\ b { strFrag . ("\b" <$) }
\\ \\ { strFrag . ("\\" <$) }
\\ \" { strFrag . ("\"" <$) }
\\ n { strFrag . (Text.singleton '\n' <$) }
\\ t { strFrag . (Text.singleton '\t' <$) }
\\ r { strFrag . (Text.singleton '\r' <$) }
\\ f { strFrag . (Text.singleton '\f' <$) }
\\ b { strFrag . (Text.singleton '\b' <$) }
\\ \\ { strFrag . (Text.singleton '\\' <$) }
\\ \" { strFrag . (Text.singleton '\"' <$) }
$control # [\t\r\n] { recommendEscape }
}

{

type AlexInput = Located String
type AlexInput = Located Text

alexGetByte :: AlexInput -> Maybe (Int, AlexInput)
alexGetByte = locatedUncons

-- | Get the next token from a located string. This function can be total
-- because one of the possible token outputs is an error token.
scanToken :: Context -> Located String -> Either (Located String) (Located Token, Located String)
scanToken :: Context -> Located Text -> Either (Located String) (Located Token, Located Text)
scanToken st str =
case alexScan str (stateInt st) of
AlexEOF -> eofToken st str
AlexError str' -> Left (mkError <$> str')
AlexError str' -> Left (mkError . Text.unpack <$> str')
AlexSkip str' _ -> scanToken st str'
AlexToken str' n action ->
case action (take n <$> str) st of
case action (Text.take n <$> str) st of
Resume st' -> scanToken st' str'
LexerError e -> Left e
EmitToken t -> Right (t, str')
@@ -206,7 +208,7 @@ stateInt LstrContext {} = lstr
stateInt MlLstrContext{} = mllstr
-- | Lex a single token in a value context. This is mostly useful for testing.
lexValue :: String -> Either String Token
lexValue :: Text -> Either String Token
lexValue str =
case scanToken ValueContext Located{ locPosition = startPos, locThing = str } of
Left e -> Left (locThing e)
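Because `lexValue` now takes `Text`, lexing a value literal directly is a one-liner with `OverloadedStrings`. A small hedged example (the token constructors named in the comment appear in the encoder driver later in this diff):

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Toml.Lexer (lexValue, Token(..))

-- Each input is lexed in ValueContext; successful results are tokens such
-- as TokInteger or TokTrue, failures carry an error message.
samples :: [Either String Token]
samples = map lexValue ["123", "true", "1979-05-27"]
```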
42 changes: 21 additions & 21 deletions src/Toml/Lexer/Utils.hs
@@ -58,7 +58,7 @@ import Toml.Located (Located(..))
import Toml.Position (move, Position)

-- | Type of actions associated with lexer patterns
type Action = Located String -> Context -> Outcome
type Action = Located Text -> Context -> Outcome

data Outcome
= Resume Context
@@ -70,10 +70,10 @@ data Context
= TopContext -- ^ top-level where @[[@ and @]]@ have special meaning
| TableContext -- ^ inline table - lex key names
| ValueContext -- ^ value lexer - lex number literals
| MlBstrContext Position [String] -- ^ multiline basic string: position of opening delimiter and list of fragments
| BstrContext Position [String] -- ^ basic string: position of opening delimiter and list of fragments
| MlLstrContext Position [String] -- ^ multiline literal string: position of opening delimiter and list of fragments
| LstrContext Position [String] -- ^ literal string: position of opening delimiter and list of fragments
| MlBstrContext Position [Text] -- ^ multiline basic string: position of opening delimiter and list of fragments
| BstrContext Position [Text] -- ^ basic string: position of opening delimiter and list of fragments
| MlLstrContext Position [Text] -- ^ multiline literal string: position of opening delimiter and list of fragments
| LstrContext Position [Text] -- ^ literal string: position of opening delimiter and list of fragments
deriving Show

-- | Add a literal fragment of a string to the current string state.
@@ -88,10 +88,10 @@ strFrag (Located _ s) = \case
-- | End the current string state and emit the string literal token.
endStr :: Action
endStr (Located _ x) = \case
BstrContext p acc -> EmitToken (Located p (TokString (Text.pack (concat (reverse (x : acc))))))
MlBstrContext p acc -> EmitToken (Located p (TokMlString (Text.pack (concat (reverse (x : acc))))))
LstrContext p acc -> EmitToken (Located p (TokString (Text.pack (concat (reverse (x : acc))))))
MlLstrContext p acc -> EmitToken (Located p (TokMlString (Text.pack (concat (reverse (x : acc))))))
BstrContext p acc -> EmitToken (Located p (TokString (Text.concat (reverse (x : acc)))))
MlBstrContext p acc -> EmitToken (Located p (TokMlString (Text.concat (reverse (x : acc)))))
LstrContext p acc -> EmitToken (Located p (TokString (Text.concat (reverse (x : acc)))))
MlLstrContext p acc -> EmitToken (Located p (TokMlString (Text.concat (reverse (x : acc)))))
_ -> error "endStr: panic"

-- | Start a basic string literal
@@ -113,27 +113,27 @@ startMlLstr (Located p _) _ = Resume (MlLstrContext p [])
-- | Resolve a unicode escape sequence and add it to the current string literal
unicodeEscape :: Action
unicodeEscape (Located p lexeme) ctx =
case readHex (drop 2 lexeme) of
case readHex (drop 2 (Text.unpack lexeme)) of
[(n,_)] | 0xd800 <= n, n < 0xe000 -> LexerError (Located p "non-scalar unicode escape")
| n >= 0x110000 -> LexerError (Located p "unicode escape too large")
| otherwise -> strFrag (Located p [chr n]) ctx
| otherwise -> strFrag (Located p (Text.singleton (chr n))) ctx
_ -> error "unicodeEscape: panic"

recommendEscape :: Action
recommendEscape (Located p x) _ =
LexerError (Located p (printf "control characters must be escaped, use: \\u%04X" (ord (head x))))
LexerError (Located p (printf "control characters must be escaped, use: \\u%04X" (ord (Text.head x))))

-- | Emit a token ignoring the current lexeme
token_ :: Token -> Action
token_ t x _ = EmitToken (t <$ x)

-- | Emit a token using the current lexeme
token :: (String -> Token) -> Action
token f x _ = EmitToken (f <$> x)
token f x _ = EmitToken (f . Text.unpack <$> x)

-- | Emit a token using the current lexeme
textToken :: (Text -> Token) -> Action
textToken f x _ = EmitToken (f . Text.pack <$> x)
textToken f x _ = EmitToken (f <$> x)

-- | Attempt to parse the current lexeme as a date-time token.
timeValue ::
@@ -143,19 +143,19 @@ timeValue ::
(a -> Token) {- ^ token constructor -} ->
Action
timeValue description patterns constructor (Located p str) _ =
case asum [parseTimeM False defaultTimeLocale pat str | pat <- patterns] of
case asum [parseTimeM False defaultTimeLocale pat (Text.unpack str) | pat <- patterns] of
Nothing -> LexerError (Located p ("malformed " ++ description))
Just t -> EmitToken (Located p (constructor t))

-- | Pop the first character off a located string if it's not empty.
-- The resulting 'Int' will either be the ASCII value of the character
-- or @1@ for non-ASCII Unicode values. To avoid a clash, @\x1@ is
-- remapped to @0@.
locatedUncons :: Located String -> Maybe (Int, Located String)
locatedUncons :: Located Text -> Maybe (Int, Located Text)
locatedUncons Located { locPosition = p, locThing = str } =
case str of
"" -> Nothing
x:xs
case Text.uncons str of
Nothing -> Nothing
Just (x, xs)
| rest `seq` False -> undefined
| x == '\1' -> Just (0, rest)
| isAscii x -> Just (ord x, rest)
@@ -164,12 +164,12 @@ locatedUncons Located { locPosition = p, locThing = str } =
rest = Located { locPosition = move x p, locThing = xs }

-- | Generate the correct terminating token given the current lexer state.
eofToken :: Context -> Located String -> Either (Located String) (Located Token, Located String)
eofToken :: Context -> Located Text -> Either (Located String) (Located Token, Located Text)
eofToken (MlBstrContext p _) _ = Left (Located p "unterminated multi-line basic string")
eofToken (BstrContext p _) _ = Left (Located p "unterminated basic string")
eofToken (MlLstrContext p _) _ = Left (Located p "unterminated multi-line literal string")
eofToken (LstrContext p _) _ = Left (Located p "unterminated literal string")
eofToken _ t = Right (TokEOF <$ t, t)
eofToken _ t = Right (TokEOF <$ t, t)

failure :: String -> Action
failure err t _ = LexerError (err <$ t)
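The string contexts above now accumulate `Text` fragments, and `endStr` joins them with a single `Text.concat` rather than packing a concatenated `String`. A minimal illustration of the reverse-accumulate idiom (the helper name is invented for the example):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text

-- Fragments are consed on in reverse order while lexing; the closing
-- fragment plus the accumulator are reversed and concatenated once.
joinFragments :: Text -> [Text] -> Text
joinFragments closing acc = Text.concat (reverse (closing : acc))
```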
2 changes: 1 addition & 1 deletion src/Toml/Parser.y
@@ -146,7 +146,7 @@ pop :: { () }
-- | Parse a list of tokens either returning the first unexpected
-- token or a list of the TOML statements in the file to be
-- processed by "Toml.Semantics".
parseRawToml :: String -> Either (Located String) [Expr Position]
parseRawToml :: Text -> Either (Located String) [Expr Position]
parseRawToml = runParser parseRawToml_ TopContext . Located startPos

}
6 changes: 3 additions & 3 deletions src/Toml/Parser/Utils.hs
@@ -48,13 +48,13 @@ import Toml.Position (Position)
-- continuation passing implementation of a state monad with errors
newtype Parser r a = P {
getP ::
[Context] -> Located String ->
([Context] -> Located String -> a -> Either (Located String) r) ->
[Context] -> Located Text ->
([Context] -> Located Text -> a -> Either (Located String) r) ->
Either (Located String) r
}

-- | Run the top-level parser
runParser :: Parser r r -> Context -> Located String -> Either (Located String) r
runParser :: Parser r r -> Context -> Located Text -> Either (Located String) r
runParser (P k) ctx str = k [ctx] str \_ _ r -> Right r

-- | Bind implementation used in the happy-generated parser
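The parser's threaded input is now a `Located Text`. As a hedged illustration of the continuation-passing shape above, written as if inside `Toml.Parser.Utils` (the library's actual name for this primitive is not shown in the hunk), a pure parser simply hands its context stack and remaining input to the continuation:

```haskell
-- A return/pure-style parser in the CPS encoding above: nothing is
-- consumed and no lexer context is pushed or popped.
unitP :: a -> Parser r a
unitP x = P (\ctxs input k -> k ctxs input x)
```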
10 changes: 6 additions & 4 deletions test-drivers/decoder/Main.hs
@@ -14,12 +14,14 @@ module Main (main) where

import Data.Aeson qualified as Aeson
import Data.ByteString.Lazy qualified as BS
import Data.Text qualified as Text
import Data.Text.IO qualified as Text
import Toml (Value(..), Value'(..), parse, Table'(..))
import Toml.Pretty (prettyValue)

main :: IO ()
main =
do txt <- getContents
do txt <- Text.getContents
case parse txt of
Left e -> fail e
Right t -> BS.putStr (Aeson.encode t)
@@ -31,10 +33,10 @@ instance Aeson.ToJSON (Toml.Value' a) where
toJSON v =
case v of
Table' _ t -> Aeson.toJSON t
Array' _ a -> Aeson.toJSON a
String' _ s -> simple "string" s
List' _ a -> Aeson.toJSON a
Text' _ s -> simple "string" (Text.unpack s)
Integer' _ _ -> simple "integer" (show (prettyValue v))
Float' _ _ -> simple "float" (show (prettyValue v))
Double' _ _ -> simple "float" (show (prettyValue v))
Bool' _ _ -> simple "bool" (show (prettyValue v))
TimeOfDay' _ _ -> simple "time-local" (show (prettyValue v))
ZonedTime' _ _ -> simple "datetime" (show (prettyValue v))
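The decoder driver above switches from `getContents` to `Data.Text.IO.getContents`. Callers reading TOML from a file would adapt the same way; a sketch (the file name is a placeholder, and nothing in this commit prescribes this wrapper):

```haskell
import qualified Data.Text.IO as Text
import Toml (parse)

main :: IO ()
main =
  do txt <- Text.readFile "example.toml"
     case parse txt of
       Left e  -> fail e
       Right _ -> putStrLn "parsed OK"
```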
16 changes: 9 additions & 7 deletions test-drivers/encoder/Main.hs
@@ -18,6 +18,8 @@ import Data.Aeson.Types qualified as Aeson
import Data.ByteString.Lazy qualified as BS
import Data.Foldable (toList)
import Data.Map qualified as Map
import Data.Text (Text)
import Data.Text qualified as Text
import System.Exit (exitFailure)
import Toml (prettyToml, Value(..), Value'(..), Table)
import Toml.Lexer (lexValue, Token(..))
@@ -34,26 +36,26 @@ instance a ~ () => Aeson.FromJSON (Toml.Value' a) where
parseJSON =
mconcat [
Aeson.withArray "array" \xs ->
Toml.Array <$> traverse Aeson.parseJSON (toList xs),
Toml.List <$> traverse Aeson.parseJSON (toList xs),
Aeson.withObject "value" \o ->
do ty <- o Aeson..: "type"
vl <- o Aeson..: "value"
decodeValue ty vl,
fmap (toValue :: Map.Map String Value -> Value) . Aeson.parseJSON
]

decodeValue :: String -> String -> Aeson.Parser Toml.Value
decodeValue "string" x = pure (Toml.String x)
decodeValue :: String -> Text -> Aeson.Parser Toml.Value
decodeValue "string" x = pure (Toml.Text x)
decodeValue "bool" (lexValue -> Right TokTrue ) = pure (Toml.Bool True)
decodeValue "bool" (lexValue -> Right TokFalse ) = pure (Toml.Bool False)
decodeValue "integer" (lexValue -> Right (TokInteger x)) = pure (Toml.Integer x)
decodeValue "time-local" (lexValue -> Right (TokLocalTime x)) = pure (Toml.TimeOfDay x)
decodeValue "datetime" (lexValue -> Right (TokOffsetDateTime x)) = pure (Toml.ZonedTime x)
decodeValue "datetime-local" (lexValue -> Right (TokLocalDateTime x)) = pure (Toml.LocalTime x)
decodeValue "date-local" (lexValue -> Right (TokLocalDate x)) = pure (Toml.Day x)
decodeValue "float" (lexValue -> Right (TokFloat x)) = pure (Toml.Float x)
decodeValue "float" (lexValue -> Right (TokFloat x)) = pure (Toml.Double x)
-- toml-tests are inconsistent about representing floating point numbers
decodeValue "float" (lexValue -> Right (TokInteger x)) = pure (Toml.Float (fromInteger x))
decodeValue "float" "+Inf" = pure (Toml.Float (1/0))
decodeValue "float" "-Inf" = pure (Toml.Float (-1/0))
decodeValue "float" (lexValue -> Right (TokInteger x)) = pure (Toml.Double (fromInteger x))
decodeValue "float" "+Inf" = pure (Toml.Double (1/0))
decodeValue "float" "-Inf" = pure (Toml.Double (-1/0))
decodeValue _ _ = empty
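The `decodeValue` cases above use view patterns: the incoming `Text` is run through `lexValue` and matched on the resulting token. A standalone sketch of the same trick (the predicate is illustrative only):

```haskell
{-# LANGUAGE ViewPatterns #-}

import Data.Text (Text)
import Toml.Lexer (lexValue, Token(..))

-- Match on the token lexValue produces, as decodeValue does above.
isBoolLiteral :: Text -> Bool
isBoolLiteral (lexValue -> Right TokTrue)  = True
isBoolLiteral (lexValue -> Right TokFalse) = True
isBoolLiteral _                            = False
```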
3 changes: 2 additions & 1 deletion test-drivers/highlighter/Main.hs
@@ -10,6 +10,7 @@ Decode TOML into JSON for use with <https://github.com/BurntSushi/toml-test>
-}
module Main (main) where

import Data.Text.IO qualified as Text
import Prettyprinter.Render.Terminal
import Toml.Parser (parseRawToml)
import Toml.Pretty (prettyTomlOrdered, DocClass(..), prettyLocated, prettySemanticError)
@@ -18,7 +19,7 @@ import Toml.Semantics.Ordered (extractTableOrder, projectKey)

main :: IO ()
main =
do txt <- getContents
do txt <- Text.getContents
case parseRawToml txt of
Left e -> fail (prettyLocated e)
Right exprs ->
3 changes: 3 additions & 0 deletions test-drivers/toml-test-drivers.cabal
@@ -39,6 +39,7 @@ executable TomlDecoder
build-depends:
aeson ^>= {2.1, 2.2},
bytestring ^>= {0.10, 0.11, 0.12},
text,

executable TomlEncoder
import: shared
@@ -48,6 +49,7 @@ executable TomlEncoder
aeson ^>= {2.1, 2.2},
bytestring ^>= {0.10, 0.11, 0.12},
containers ^>= {0.5, 0.6},
text,

executable TomlHighlighter
import: shared
@@ -56,3 +58,4 @@ executable TomlHighlighter
build-depends:
prettyprinter ^>= 1.7.1,
prettyprinter-ansi-terminal ^>= 1.1.3,
text,