From 303a73f82bf216274c1aa734cbfde9c8598dc278 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 1 Dec 2024 14:42:55 -0800
Subject: [PATCH] Parse base64-encoded data URIs more efficiently

(in some places)

Very long data: URIs in source documents are causing outsized memory
usage due to various parsing inefficiencies, for instance in
Network.URI, TagSoup, and T.P.R.Markdown.source. See e.g. #10075.

This change improves the situation in a couple places we can control
relatively easily by using an attoparsec text-specialized parser to
consume base64-encoded strings. Attoparsec's takeWhile + inClass
functions are designed to chew through long strings like this without
doing unnecessary allocation, and the improvements in peak heap
allocation are significant.

One of the observations here is that if you parse something as a valid
data: uri it shouldn't need any further escaping so we can short-circuit
various processing steps that may unpack/iterate over the chars in the
URI.
---
 pandoc.cabal                        |  1 +
 src/Text/Pandoc/Parsing.hs          |  4 ++-
 src/Text/Pandoc/Parsing/Base64.hs   | 39 ++++++++++++++++++++++++++++
 src/Text/Pandoc/Readers/HTML.hs     | 14 +++++-----
 src/Text/Pandoc/Readers/Markdown.hs | 26 +++++++++++++++++--
 src/Text/Pandoc/URI.hs              | 40 ++++++++++++++++++++++++++---
 6 files changed, 111 insertions(+), 13 deletions(-)
 create mode 100644 src/Text/Pandoc/Parsing/Base64.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index 32ebcd43a6aa..812f620281e8 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -679,6 +679,7 @@ library
                    Text.Pandoc.Class.Sandbox,
                    Text.Pandoc.Filter.Environment,
                    Text.Pandoc.Filter.JSON,
+                   Text.Pandoc.Parsing.Base64,
                    Text.Pandoc.Parsing.Capabilities,
                    Text.Pandoc.Parsing.Citations,
                    Text.Pandoc.Parsing.General,
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 67fcc363dfc4..548c28425869 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
                              Column,
                              ParseError,
                              errorMessages,
-                             messageString
+                             messageString,
+                             parseBase64String
                              )
 where
 
@@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State
       SubstTable )
 import Text.Pandoc.Parsing.Future
     ( askF, asksF, returnF, runF, Future(..) )
+import Text.Pandoc.Parsing.Base64 (parseBase64String)
diff --git a/src/Text/Pandoc/Parsing/Base64.hs b/src/Text/Pandoc/Parsing/Base64.hs
new file mode 100644
index 000000000000..4330a83d14de
--- /dev/null
+++ b/src/Text/Pandoc/Parsing/Base64.hs
@@ -0,0 +1,39 @@
+{-# LANGUAGE FlexibleContexts           #-}
+{-# LANGUAGE OverloadedStrings          #-}
+{- |
+Module      : Text.Pandoc.Parsing.Base64
+Copyright   : © 2024 Evan Silberman
+License     : GPL-2.0-or-later
+Maintainer  : John MacFarlane <jgm@berkeley.edu>
+
+Parse large base64 strings efficiently within Pandoc's
+normal parsing environment
+-}
+
+module Text.Pandoc.Parsing.Base64
+  ( parseBase64String )
+
+where
+
+import Data.Text as T
+import Data.Attoparsec.Text as A
+import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn)
+import Text.Pandoc.Sources
+import Control.Monad (mzero)
+
+parseBase64String :: Monad m => ParsecT Sources u m Text
+parseBase64String = do
+  Sources ((pos, txt):rest) <- getInput
+  let r = A.parse pBase64 txt
+  case r of
+    Done remaining consumed -> do
+      let pos' = incSourceColumn pos (T.length consumed)
+      setInput $ Sources ((pos', remaining):rest)
+      return consumed
+    _ -> mzero
+
+pBase64 :: A.Parser Text
+pBase64 = do
+  most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/")
+  rest <- A.takeWhile (== '=')
+  return $ most <> rest
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 9704b047e0f1..ff6bbd881dd8 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Shared (
     addMetaField, extractSpaces, htmlSpanLikeElements, renderTags',
     safeRead, tshow, formatCode)
-import Text.Pandoc.URI (escapeURI)
+import Text.Pandoc.URI (escapeURI, isBase64DataURI)
 import Text.Pandoc.Walk
 import Text.TeXMath (readMathML, writeTeX)
 import qualified Data.Sequence as Seq
@@ -1219,8 +1219,10 @@ htmlTag f = try $ do
 
 -- | Adjusts a url according to the document's base URL.
 canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
-canonicalizeUrl url = do
-  mbBaseHref <- baseHref <$> getState
-  return $ case (parseURIReference (T.unpack url), mbBaseHref) of
-                (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
-                _                   -> url
+canonicalizeUrl url 
+  | isBase64DataURI url = return url
+  | otherwise = do
+     mbBaseHref <- baseHref <$> getState
+     return $ case (parseURIReference (T.unpack url), mbBaseHref) of
+                   (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
+                   _                   -> url
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 51782f00015f..b01c937f7c92 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1822,13 +1822,35 @@ parenthesizedChars = do
   result <- charsInBalanced '(' ')' litChar
   return $ "(" <> result <> ")"
 
+pBase64DataURI :: PandocMonad m => MarkdownParser m Text
+pBase64DataURI = mconcat <$> sequence
+  [ (textStr "data:")
+  , (T.singleton <$> (letter <|> digit))
+  , restrictedName
+  , (T.singleton <$> char '/')
+  , restrictedName
+  , textStr ";" <* trace "cool"
+  , (mconcat <$> many mediaParam)
+  , textStr "base64," <* trace "fine"
+  , parseBase64String
+  ]
+ where
+    restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-")
+    mediaParam = mconcat <$> sequence
+      [ notFollowedBy (textStr "base64,") *> mempty -- XXX ???
+      , restrictedName
+      , textStr "="
+      , manyChar (noneOf ";")
+      , textStr ";"
+      ]
+
 -- source for a link, with optional title
 source :: PandocMonad m => MarkdownParser m (Text, Text)
 source = do
   char '('
   skipSpaces
-  let urlChunk =
-            try parenthesizedChars
+  let urlChunk = try pBase64DataURI
+        <|> try parenthesizedChars
         <|> (notFollowedBy (oneOf " )") >> litChar)
         <|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
   let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs
index a0b47d259cd5..5052b4c6c514 100644
--- a/src/Text/Pandoc/URI.hs
+++ b/src/Text/Pandoc/URI.hs
@@ -15,12 +15,16 @@ module Text.Pandoc.URI ( urlEncode
                        , isURI
                        , schemes
                        , uriPathToPath
+                       , isBase64DataURI
                        ) where
 import qualified Network.HTTP.Types as HTTP
 import qualified Text.Pandoc.UTF8 as UTF8
 import qualified Data.Text as T
+import qualified Data.Attoparsec.Text as A
 import qualified Data.Set as Set
 import Data.Char (isSpace, isAscii)
+import Data.Either (isRight)
+import Control.Applicative ((<|>))
 import Network.URI (URI (uriScheme), parseURI, escapeURIString)
 
 urlEncode :: T.Text -> T.Text
@@ -28,7 +32,9 @@ urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText
 
 -- | Escape whitespace and some punctuation characters in URI.
 escapeURI :: T.Text -> T.Text
-escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack
+escapeURI t
+  | isBase64DataURI t = t
+  | otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t
   where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`"
 
 --
@@ -87,12 +93,38 @@ schemes = Set.fromList
   , "doi", "gemini", "isbn", "javascript", "pmid"
   ]
 
+isBase64DataURI :: T.Text -> Bool
+isBase64DataURI =
+  isRight . A.parseOnly pBase64DataURI
+
+pBase64DataURI :: A.Parser T.Text
+pBase64DataURI = mconcat <$> sequence
+  [ (A.string "data:")
+  , (T.singleton <$> (A.letter <|> A.digit))
+  , restrictedName
+  , (T.singleton <$> A.char '/')
+  , restrictedName
+  , (mconcat <$> A.many' mediaParam)
+  , A.string ";base64,"
+  , A.takeWhile1 (A.inClass "A-Za-z0-9+/")
+  , A.takeWhile (== '=')
+  ]
+ where
+    restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-")
+    mediaParam = mconcat <$> sequence
+      [ (T.singleton <$> A.char ';')
+      , restrictedName
+      , (T.singleton <$> A.char '=')
+      , A.takeWhile (/= ';') 
+      ]
+
 -- | Check if the string is a valid URL with a IANA or frequently used but
 -- unofficial scheme (see @schemes@).
 isURI :: T.Text -> Bool
-isURI =
-  -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
-  maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
+isURI t =
+  isBase64DataURI t ||
+    -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
+    (maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t
   where
     hasKnownScheme = (`Set.member` schemes) . T.toLower .
                      T.filter (/= ':') . T.pack . uriScheme