Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse base64-encoded data URIs more efficiently #10434

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ library
Text.Pandoc.Class.Sandbox,
Text.Pandoc.Filter.Environment,
Text.Pandoc.Filter.JSON,
Text.Pandoc.Parsing.Base64,
Text.Pandoc.Parsing.Capabilities,
Text.Pandoc.Parsing.Citations,
Text.Pandoc.Parsing.General,
Expand Down
4 changes: 3 additions & 1 deletion src/Text/Pandoc/Parsing.hs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
Column,
ParseError,
errorMessages,
messageString
messageString,
parseBase64String
)
where

Expand Down Expand Up @@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State
SubstTable )
import Text.Pandoc.Parsing.Future
( askF, asksF, returnF, runF, Future(..) )
import Text.Pandoc.Parsing.Base64 (parseBase64String)
39 changes: 39 additions & 0 deletions src/Text/Pandoc/Parsing/Base64.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Parsing.Base64
Copyright : © 2024 Evan Silberman
License : GPL-2.0-or-later
Maintainer : John MacFarlane <jgm@berkeley.edu>

Parse large base64 strings efficiently within Pandoc's
normal parsing environment
-}

module Text.Pandoc.Parsing.Base64
( parseBase64String )

where

import Data.Text as T
import Data.Attoparsec.Text as A
import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn)
import Text.Pandoc.Sources
import Control.Monad (mzero)

-- | Efficiently consume a run of base64 characters
-- (@A-Za-z0-9+\/@ followed by optional @=@ padding) from the current
-- input chunk, returning the matched text.  Fails (via 'mzero', or a
-- pattern-match parse failure when the 'Sources' list is empty)
-- without consuming input when no base64 characters are present.
parseBase64String :: Monad m => ParsecT Sources u m Text
parseBase64String = do
  Sources ((pos, txt):rest) <- getInput
  -- Finalize with an empty feed: 'A.parse' returns 'Partial' when the
  -- match runs to the very end of the chunk, and the previous code
  -- rejected that case even though the parse had succeeded.
  case A.feed (A.parse pBase64 txt) T.empty of
    Done remaining consumed -> do
      -- pBase64 never matches a newline, so only the column advances.
      let pos' = incSourceColumn pos (T.length consumed)
      -- NOTE(review): updating the input with setInput means parsec
      -- itself does not record that anything was consumed; confirm this
      -- behaves correctly under combinators such as 'many' before
      -- reusing it elsewhere (concern raised in review).
      setInput $ Sources ((pos', remaining):rest)
      return consumed
    _ -> mzero

-- | Attoparsec parser for a base64 run: one or more characters from
-- the base64 alphabet, then any number of @=@ padding characters.
pBase64 :: A.Parser Text
pBase64 =
  (<>) <$> A.takeWhile1 (A.inClass "A-Za-z0-9+/")
       <*> A.takeWhile (== '=')
Comment on lines +25 to +39
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two thoughts on this:

  1. Is attoparsec really necessary? Why not just Data.Text.takeWhile?
  2. My experience is that parsers like this, which just manipulate the input directly using getInput and setInput, are problematic in parsec because parsec doesn't realize that input has been consumed. I've had to use a regular parsec parser somewhere in there to make it realize this. One option is just something like count charsConsumed anyChar, and then you don't need to compute the end position manually...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. In this spot I can possibly just borrow the fast inClass function from attoparsec and use it with regular text takeWhile, will have to fiddle with it.
  2. will investigate. I took it for granted that fiddling with the input, as seen here, would keep the parsec state happy — but that assumption was not based on deep understanding or rigorous analysis.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably fine to use attoparsec, but there might be a slight speedup if you can avoid it.

On 2: you could try putting this parser under many and see if parsec complains.

14 changes: 8 additions & 6 deletions src/Text/Pandoc/Readers/HTML.hs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (
addMetaField, extractSpaces, htmlSpanLikeElements, renderTags',
safeRead, tshow, formatCode)
import Text.Pandoc.URI (escapeURI)
import Text.Pandoc.URI (escapeURI, isBase64DataURI)
import Text.Pandoc.Walk
import Text.TeXMath (readMathML, writeTeX)
import qualified Data.Sequence as Seq
Expand Down Expand Up @@ -1219,8 +1219,10 @@ htmlTag f = try $ do

-- | Adjusts a url according to the document's base URL.
canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
canonicalizeUrl url
  -- Base64 data URIs can be huge; return them untouched rather than
  -- round-tripping them through the URI parser.
  | isBase64DataURI url = return url
  | otherwise = do
      base <- baseHref <$> getState
      pure $ case (parseURIReference (T.unpack url), base) of
        -- Resolve the relative reference against the document base.
        (Just rel, Just b) -> tshow (rel `nonStrictRelativeTo` b)
        -- Unparseable URL or no base href: leave the URL as-is.
        _ -> url
26 changes: 24 additions & 2 deletions src/Text/Pandoc/Readers/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1822,13 +1822,35 @@ parenthesizedChars = do
result <- charsInBalanced '(' ')' litChar
return $ "(" <> result <> ")"

-- | Match a base64-encoded @data:@ URI, e.g.
-- @data:image\/png;base64,iVBORw0...@, returning the full matched text.
-- The (potentially very large) payload is consumed in one efficient
-- scan by 'parseBase64String'.
pBase64DataURI :: PandocMonad m => MarkdownParser m Text
pBase64DataURI = mconcat <$> sequence
  [ textStr "data:"
    -- media type: a leading letter/digit, then restricted-name chars
  , T.singleton <$> (letter <|> digit)
  , restrictedName
  , T.singleton <$> char '/'
  , restrictedName
  , textStr ";"
  , mconcat <$> many mediaParam
  , textStr "base64,"
  , parseBase64String
  ]
  where
    restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-")
    -- A media-type parameter, @name=value;@.  The notFollowedBy guard
    -- stops parameter parsing at the final @base64,@ marker; it
    -- contributes no text, hence the explicit @pure ""@.
    mediaParam = mconcat <$> sequence
      [ notFollowedBy (textStr "base64,") *> pure ""
      , restrictedName
      , textStr "="
      , manyChar (noneOf ";")
      , textStr ";"
      ]

-- source for a link, with optional title
source :: PandocMonad m => MarkdownParser m (Text, Text)
source = do
char '('
skipSpaces
let urlChunk =
try parenthesizedChars
let urlChunk = try pBase64DataURI
<|> try parenthesizedChars
<|> (notFollowedBy (oneOf " )") >> litChar)
<|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
Expand Down
40 changes: 36 additions & 4 deletions src/Text/Pandoc/URI.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,26 @@ module Text.Pandoc.URI ( urlEncode
, isURI
, schemes
, uriPathToPath
, isBase64DataURI
) where
import qualified Network.HTTP.Types as HTTP
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Data.Text as T
import qualified Data.Attoparsec.Text as A
import qualified Data.Set as Set
import Data.Char (isSpace, isAscii)
import Data.Either (isRight)
import Control.Applicative ((<|>))
import Network.URI (URI (uriScheme), parseURI, escapeURIString)

-- | Percent-encode text, going through UTF-8 bytes and http-types'
-- 'HTTP.urlEncode' (with query-string escaping enabled).
urlEncode :: T.Text -> T.Text
urlEncode t = UTF8.toText (HTTP.urlEncode True (UTF8.fromText t))

-- | Escape whitespace and some punctuation characters in URI.
escapeURI :: T.Text -> T.Text
escapeURI t
  -- Base64 data URIs are returned unchanged, avoiding an expensive
  -- character-by-character scan of very large payloads.
  | isBase64DataURI t = t
  | otherwise = T.pack (escapeURIString keep (T.unpack t))
  where
    -- Keep everything except whitespace and this punctuation set.
    keep c = not (isSpace c || T.any (== c) "<>|\"{}[]^`")

--
Expand Down Expand Up @@ -87,12 +93,38 @@ schemes = Set.fromList
, "doi", "gemini", "isbn", "javascript", "pmid"
]

-- | True when the /entire/ string is a base64-encoded @data:@ URI.
-- The 'A.endOfInput' is essential: 'A.parseOnly' alone succeeds on any
-- string with a valid data-URI prefix, so trailing garbage (e.g. an
-- unescaped space followed by more text) would otherwise be accepted.
isBase64DataURI :: T.Text -> Bool
isBase64DataURI =
  isRight . A.parseOnly (pBase64DataURI <* A.endOfInput)

-- | Attoparsec parser for a base64 @data:@ URI:
-- @data:type\/subtype(;param=value)*;base64,\<payload\>@.
-- Returns the full matched text.
pBase64DataURI :: A.Parser T.Text
pBase64DataURI = mconcat <$> sequence
  [ (A.string "data:")
    -- media type: a leading letter/digit, then restricted-name chars
  , (T.singleton <$> (A.letter <|> A.digit))
  , restrictedName
  , (T.singleton <$> A.char '/')
  , restrictedName
    -- zero or more @;name=value@ parameters; attoparsec backtracks out
    -- of a failed partial match, so the final @;base64,@ marker is not
    -- swallowed by 'mediaParam'
  , (mconcat <$> A.many' mediaParam)
  , A.string ";base64,"
    -- payload: base64 alphabet followed by optional @=@ padding
  , A.takeWhile1 (A.inClass "A-Za-z0-9+/")
  , A.takeWhile (== '=')
  ]
  where
    restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-")
    -- One @;name=value@ media-type parameter; the value is anything up
    -- to the next @;@.
    mediaParam = mconcat <$> sequence
      [ (T.singleton <$> A.char ';')
      , restrictedName
      , (T.singleton <$> A.char '=')
      , A.takeWhile (/= ';')
      ]

-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: T.Text -> Bool
isURI t
  -- Base64 data URIs are accepted directly, bypassing network-uri's
  -- parser on potentially enormous strings.
  | isBase64DataURI t = True
  | otherwise =
      -- URI-escape non-ASCII characters first, because parseURI
      -- chokes on them otherwise.
      case parseURI (escapeURIString isAscii (T.unpack t)) of
        Nothing  -> False
        Just uri -> hasKnownScheme uri
  where
    hasKnownScheme =
      (`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme
Expand Down
Loading