diff --git a/pandoc.cabal b/pandoc.cabal index 32ebcd43a6aa..812f620281e8 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -679,6 +679,7 @@ library Text.Pandoc.Class.Sandbox, Text.Pandoc.Filter.Environment, Text.Pandoc.Filter.JSON, + Text.Pandoc.Parsing.Base64, Text.Pandoc.Parsing.Capabilities, Text.Pandoc.Parsing.Citations, Text.Pandoc.Parsing.General, diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 67fcc363dfc4..548c28425869 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources, Column, ParseError, errorMessages, - messageString + messageString, + parseBase64String ) where @@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State SubstTable ) import Text.Pandoc.Parsing.Future ( askF, asksF, returnF, runF, Future(..) ) +import Text.Pandoc.Parsing.Base64 (parseBase64String) diff --git a/src/Text/Pandoc/Parsing/Base64.hs b/src/Text/Pandoc/Parsing/Base64.hs new file mode 100644 index 000000000000..4330a83d14de --- /dev/null +++ b/src/Text/Pandoc/Parsing/Base64.hs @@ -0,0 +1,39 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Base64 +Copyright : © 2024 Evan Silberman +License : GPL-2.0-or-later +Maintainer : John MacFarlane + +Parse large base64 strings efficiently within Pandoc's +normal parsing environment +-} + +module Text.Pandoc.Parsing.Base64 + ( parseBase64String ) + +where + +import Data.Text as T +import Data.Attoparsec.Text as A +import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn) +import Text.Pandoc.Sources +import Control.Monad (mzero) + +parseBase64String :: Monad m => ParsecT Sources u m Text +parseBase64String = do + Sources ((pos, txt):rest) <- getInput + let r = A.parse pBase64 txt + case r of + Done remaining consumed -> do + let pos' = incSourceColumn pos (T.length consumed) + setInput $ Sources ((pos', remaining):rest) + return consumed + _ -> mzero + +pBase64 :: A.Parser Text +pBase64 = do + most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/") + rest <- A.takeWhile (== '=') + return $ most <> rest diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 9704b047e0f1..ff6bbd881dd8 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared ( addMetaField, extractSpaces, htmlSpanLikeElements, renderTags', safeRead, tshow, formatCode) -import Text.Pandoc.URI (escapeURI) +import Text.Pandoc.URI (escapeURI, isBase64DataURI) import Text.Pandoc.Walk import Text.TeXMath (readMathML, writeTeX) import qualified Data.Sequence as Seq @@ -1219,8 +1219,10 @@ htmlTag f = try $ do -- | Adjusts a url according to the document's base URL. canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text -canonicalizeUrl url = do - mbBaseHref <- baseHref <$> getState - return $ case (parseURIReference (T.unpack url), mbBaseHref) of - (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) - _ -> url +canonicalizeUrl url + | isBase64DataURI url = return url + | otherwise = do + mbBaseHref <- baseHref <$> getState + return $ case (parseURIReference (T.unpack url), mbBaseHref) of + (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) + _ -> url diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 51782f00015f..b01c937f7c92 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1822,13 +1822,35 @@ parenthesizedChars = do result <- charsInBalanced '(' ')' litChar return $ "(" <> result <> ")" +pBase64DataURI :: PandocMonad m => MarkdownParser m Text +pBase64DataURI = mconcat <$> sequence + [ (textStr "data:") + , (T.singleton <$> (letter <|> digit)) + , restrictedName + , (T.singleton <$> char '/') + , restrictedName + , textStr ";" <* trace "cool" + , (mconcat <$> many mediaParam) + , textStr "base64," <* trace "fine" + , parseBase64String + ] + where + restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-") + mediaParam = mconcat <$> sequence + [ notFollowedBy (textStr "base64,") *> mempty -- XXX ??? + , restrictedName + , textStr "=" + , manyChar (noneOf ";") + , textStr ";" + ] + -- source for a link, with optional title source :: PandocMonad m => MarkdownParser m (Text, Text) source = do char '(' skipSpaces - let urlChunk = - try parenthesizedChars + let urlChunk = try pBase64DataURI + <|> try parenthesizedChars <|> (notFollowedBy (oneOf " )") >> litChar) <|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')")) let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs index a0b47d259cd5..5052b4c6c514 100644 --- a/src/Text/Pandoc/URI.hs +++ b/src/Text/Pandoc/URI.hs @@ -15,12 +15,16 @@ module Text.Pandoc.URI ( urlEncode , isURI , schemes , uriPathToPath + , isBase64DataURI ) where import qualified Network.HTTP.Types as HTTP import qualified Text.Pandoc.UTF8 as UTF8 import qualified Data.Text as T +import qualified Data.Attoparsec.Text as A import qualified Data.Set as Set import Data.Char (isSpace, isAscii) +import Data.Either (isRight) +import Control.Applicative ((<|>)) import Network.URI (URI (uriScheme), parseURI, escapeURIString) urlEncode :: T.Text -> T.Text @@ -28,7 +32,9 @@ urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText -- | Escape whitespace and some punctuation characters in URI. escapeURI :: T.Text -> T.Text -escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack +escapeURI t + | isBase64DataURI t = t + | otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`" -- @@ -87,12 +93,38 @@ schemes = Set.fromList , "doi", "gemini", "isbn", "javascript", "pmid" ] +isBase64DataURI :: T.Text -> Bool +isBase64DataURI = + isRight . A.parseOnly pBase64DataURI + +pBase64DataURI :: A.Parser T.Text +pBase64DataURI = mconcat <$> sequence + [ (A.string "data:") + , (T.singleton <$> (A.letter <|> A.digit)) + , restrictedName + , (T.singleton <$> A.char '/') + , restrictedName + , (mconcat <$> A.many' mediaParam) + , A.string ";base64," + , A.takeWhile1 (A.inClass "A-Za-z0-9+/") + , A.takeWhile (== '=') + ] + where + restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-") + mediaParam = mconcat <$> sequence + [ (T.singleton <$> A.char ';') + , restrictedName + , (T.singleton <$> A.char '=') + , A.takeWhile (/= ';') + ] + -- | Check if the string is a valid URL with a IANA or frequently used but -- unofficial scheme (see @schemes@). isURI :: T.Text -> Bool -isURI = - -- we URI-escape non-ASCII characters because otherwise parseURI will choke: - maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack +isURI t = + isBase64DataURI t || + -- we URI-escape non-ASCII characters because otherwise parseURI will choke: + (maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t where hasKnownScheme = (`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme