Parse base64-encoded data URIs more efficiently

(in some places) Very long data: URIs in source documents are causing outsized memory usage due to various parsing inefficiencies, for instance in Network.URI, TagSoup, and T.P.R.Markdown.source. See e.g. jgm#10075. This change improves the situation in a couple places we can control relatively easily by using an attoparsec text-specialized parser to consume base64-encoded strings. Attoparsec's takeWhile + inClass functions are designed to chew through long strings like this without doing unnecessary allocation, and the improvements in peak heap allocation are significant. One of the observations here is that if you parse something as a valid data: uri it shouldn't need any further escaping so we can short-circuit various processing steps that may unpack/iterate over the chars in the URI.
silby · Dec 4, 2024 · 303a73f · 303a73f
1 parent c34edf6
commit 303a73f
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 13 deletions.
diff --git a/pandoc.cabal b/pandoc.cabal
@@ -679,6 +679,7 @@ library
                    Text.Pandoc.Class.Sandbox,
                    Text.Pandoc.Filter.Environment,
                    Text.Pandoc.Filter.JSON,
+                   Text.Pandoc.Parsing.Base64,
                    Text.Pandoc.Parsing.Capabilities,
                    Text.Pandoc.Parsing.Citations,
                    Text.Pandoc.Parsing.General,

diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
@@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
                              Column,
                              ParseError,
                              errorMessages,
-                             messageString
+                             messageString,
+                             parseBase64String
                              )
 where
 
@@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State
       SubstTable )
 import Text.Pandoc.Parsing.Future
     ( askF, asksF, returnF, runF, Future(..) )
+import Text.Pandoc.Parsing.Base64 (parseBase64String)
diff --git a/src/Text/Pandoc/Parsing/Base64.hs b/src/Text/Pandoc/Parsing/Base64.hs
@@ -0,0 +1,39 @@
+{-# LANGUAGE FlexibleContexts           #-}
+{-# LANGUAGE OverloadedStrings          #-}
+{- |
+Module      : Text.Pandoc.Parsing.Base64
+Copyright   : © 2024 Evan Silberman
+License     : GPL-2.0-or-later
+Maintainer  : John MacFarlane <jgm@berkeley.edu>
+
+Parse large base64 strings efficiently within Pandoc's
+normal parsing environment
+-}
+
+module Text.Pandoc.Parsing.Base64
+  ( parseBase64String )
+
+where
+
+import Data.Text as T
+import Data.Attoparsec.Text as A
+import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn)
+import Text.Pandoc.Sources
+import Control.Monad (mzero)
+
+parseBase64String :: Monad m => ParsecT Sources u m Text
+parseBase64String = do
+  Sources ((pos, txt):rest) <- getInput
+  let r = A.parse pBase64 txt
+  case r of
+    Done remaining consumed -> do
+      let pos' = incSourceColumn pos (T.length consumed)
+      setInput $ Sources ((pos', remaining):rest)
+      return consumed
+    _ -> mzero
+
+pBase64 :: A.Parser Text
+pBase64 = do
+  most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/")
+  rest <- A.takeWhile (== '=')
+  return $ most <> rest
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
@@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Shared (
     addMetaField, extractSpaces, htmlSpanLikeElements, renderTags',
     safeRead, tshow, formatCode)
-import Text.Pandoc.URI (escapeURI)
+import Text.Pandoc.URI (escapeURI, isBase64DataURI)
 import Text.Pandoc.Walk
 import Text.TeXMath (readMathML, writeTeX)
 import qualified Data.Sequence as Seq
@@ -1219,8 +1219,10 @@ htmlTag f = try $ do
 
 -- | Adjusts a url according to the document's base URL.
 canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
-canonicalizeUrl url = do
-  mbBaseHref <- baseHref <$> getState
-  return $ case (parseURIReference (T.unpack url), mbBaseHref) of
-                (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
-                _                   -> url
+canonicalizeUrl url 
+  | isBase64DataURI url = return url
+  | otherwise = do
+     mbBaseHref <- baseHref <$> getState
+     return $ case (parseURIReference (T.unpack url), mbBaseHref) of
+                   (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
+                   _                   -> url
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1822,13 +1822,35 @@ parenthesizedChars = do
   result <- charsInBalanced '(' ')' litChar
   return $ "(" <> result <> ")"
 
+pBase64DataURI :: PandocMonad m => MarkdownParser m Text
+pBase64DataURI = mconcat <$> sequence
+  [ (textStr "data:")
+  , (T.singleton <$> (letter <|> digit))
+  , restrictedName
+  , (T.singleton <$> char '/')
+  , restrictedName
+  , textStr ";" <* trace "cool"
+  , (mconcat <$> many mediaParam)
+  , textStr "base64," <* trace "fine"
+  , parseBase64String
+  ]
+ where
+    restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-")
+    mediaParam = mconcat <$> sequence
+      [ notFollowedBy (textStr "base64,") *> mempty -- XXX ???
+      , restrictedName
+      , textStr "="
+      , manyChar (noneOf ";")
+      , textStr ";"
+      ]
+
 -- source for a link, with optional title
 source :: PandocMonad m => MarkdownParser m (Text, Text)
 source = do
   char '('
   skipSpaces
-  let urlChunk =
-            try parenthesizedChars
+  let urlChunk = try pBase64DataURI
+        <|> try parenthesizedChars
         <|> (notFollowedBy (oneOf " )") >> litChar)
         <|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
   let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk

diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs
@@ -15,20 +15,26 @@ module Text.Pandoc.URI ( urlEncode
                        , isURI
                        , schemes
                        , uriPathToPath
+                       , isBase64DataURI
                        ) where
 import qualified Network.HTTP.Types as HTTP
 import qualified Text.Pandoc.UTF8 as UTF8
 import qualified Data.Text as T
+import qualified Data.Attoparsec.Text as A
 import qualified Data.Set as Set
 import Data.Char (isSpace, isAscii)
+import Data.Either (isRight)
+import Control.Applicative ((<|>))
 import Network.URI (URI (uriScheme), parseURI, escapeURIString)
 
 urlEncode :: T.Text -> T.Text
 urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText
 
 -- | Escape whitespace and some punctuation characters in URI.
 escapeURI :: T.Text -> T.Text
-escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack
+escapeURI t
+  | isBase64DataURI t = t
+  | otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t
   where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`"
 
 --
@@ -87,12 +93,38 @@ schemes = Set.fromList
   , "doi", "gemini", "isbn", "javascript", "pmid"
   ]
 
+isBase64DataURI :: T.Text -> Bool
+isBase64DataURI =
+  isRight . A.parseOnly pBase64DataURI
+
+pBase64DataURI :: A.Parser T.Text
+pBase64DataURI = mconcat <$> sequence
+  [ (A.string "data:")
+  , (T.singleton <$> (A.letter <|> A.digit))
+  , restrictedName
+  , (T.singleton <$> A.char '/')
+  , restrictedName
+  , (mconcat <$> A.many' mediaParam)
+  , A.string ";base64,"
+  , A.takeWhile1 (A.inClass "A-Za-z0-9+/")
+  , A.takeWhile (== '=')
+  ]
+ where
+    restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-")
+    mediaParam = mconcat <$> sequence
+      [ (T.singleton <$> A.char ';')
+      , restrictedName
+      , (T.singleton <$> A.char '=')
+      , A.takeWhile (/= ';') 
+      ]
+
 -- | Check if the string is a valid URL with a IANA or frequently used but
 -- unofficial scheme (see @schemes@).
 isURI :: T.Text -> Bool
-isURI =
-  -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
-  maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
+isURI t =
+  isBase64DataURI t ||
+    -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
+    (maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t
   where
     hasKnownScheme = (`Set.member` schemes) . T.toLower .
                      T.filter (/= ':') . T.pack . uriScheme