Skip to content

Commit

Permalink
Fix display of block elements in JATS reader (PR #8971)
Browse files Browse the repository at this point in the history
A number of block elements, like disp-quote, list, and disp-formula, were always treated as inlines if appearing inside paragraphs, even if their usage granted a separate block. The function isElementBlock has been refined to prevent this, and a number of specific parse cases have been added to parseBlock.

Also, some minimal cleanup of the test file, in order for it to pass XML validation against the JATS DTD 1.3 (it was not compliant with the current or any previous versions of JATS).

Closes #8889.
  • Loading branch information
kamoe authored Aug 10, 2023
1 parent cbb33fe commit 6673f83
Show file tree
Hide file tree
Showing 4 changed files with 1,869 additions and 1,198 deletions.
103 changes: 65 additions & 38 deletions src/Text/Pandoc/Readers/JATS.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Conversion of JATS XML to 'Pandoc' document.

module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict ( StateT(runStateT), gets, modify )
import Control.Monad (forM_, when, unless, MonadPlus(mplus))
import Control.Monad (forM_, when, unless)
import Control.Monad.Except (throwError)
import Text.Pandoc.Error (PandocError(..))
import Data.Char (isDigit, isSpace)
Expand Down Expand Up @@ -105,31 +105,30 @@ instance HasMeta JATSState where
deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)}

isBlockElement :: Content -> Bool
isBlockElement (Elem e) = qName (elName e) `S.member` blocktags
where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags
paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap",
"code", "fig", "fig-group", "graphic", "media", "preformat",
isBlockElement (Elem e) = case qName (elName e) of
"disp-formula" -> if onlyOneChild e
then if hasFormulaChild e
then False
else case filterChild (named "alternatives") e of
Just a -> if hasFormulaChild a then False else True
Nothing -> True
else True
"alternatives" -> if hasFormulaChild e then False else True
_ -> qName (elName e) `S.member` blocktags

where blocktags = S.fromList (paragraphLevel ++ lists ++ formulae ++ other) \\ S.fromList canBeInline
paragraphLevel = ["address", "answer", "answer-set", "array", "boxed-text", "chem-struct-wrap",
"code", "explanation", "fig", "fig-group", "graphic", "media", "preformat", "question", "question-wrap", "question-wrap-group",
"supplementary-material", "table-wrap", "table-wrap-group",
"alternatives", "disp-formula", "disp-formula-group"]
lists = ["def-list", "list"]
mathML = ["tex-math", "mml:math"]
formulae = ["tex-math", "mml:math"]
other = ["p", "related-article", "related-object", "ack", "disp-quote",
"speech", "statement", "verse-group", "x"]
inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material",
"related-article", "related-object", "hr", "bold", "fixed-case",
"italic", "monospace", "overline", "overline-start", "overline-end",
"roman", "sans-serif", "sc", "strike", "underline", "underline-start",
"underline-end", "ruby", "alternatives", "inline-graphic", "private-char",
"chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev",
"milestone-end", "milestone-start", "named-content", "styled-content",
"fn", "target", "xref", "sub", "sup", "x", "address", "array",
"boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic",
"media", "preformat", "supplementary-material", "table-wrap",
"table-wrap-group", "disp-formula", "disp-formula-group",
"citation-alternatives", "element-citation", "mixed-citation",
"nlm-citation", "award-id", "funding-source", "open-access",
"def-list", "list", "ack", "disp-quote", "speech", "statement",
"verse-group"]
canBeInline = ["tex-math", "mml:math", "related-object", "x"]
onlyOneChild x = length (allChildren x) == 1
allChildren x = filterChildren (const True) x

isBlockElement _ = False

-- Trim leading and trailing newline characters
Expand Down Expand Up @@ -217,6 +216,13 @@ parseBlock (Elem e) = do
else divWith (attrValue "id" e, ["caption"], []) <$> wrapWithHeader 6 (getBlocks e)
"fn-group" -> parseFootnoteGroup
"ref-list" -> parseRefList e
"alternatives" -> if hasFormulaChild e
then blockFormula displayMath e
else getBlocks e
"disp-formula" -> if hasFormulaChild e
then blockFormula displayMath e
else divWith (attrValue "id" e, ["disp-formula"], [])
<$> getBlocks e
"?xml" -> return mempty
_ -> getBlocks e
where parseMixed container conts = do
Expand Down Expand Up @@ -363,9 +369,7 @@ parseBlock (Elem e) = do
wrapWithHeader n mBlocks = do
isBook <- gets jatsBook
let n' = if isBook || n == 0 then n + 1 else n
headerText <- case filterChild (named "title") e `mplus`
(filterChild (named "info") e >>=
filterChild (named "title")) of
headerText <- case filterChild (named "title") e of
Just t -> getInlines t
Nothing -> return mempty
oldN <- gets jatsSectionLevel
Expand Down Expand Up @@ -602,8 +606,11 @@ parseInline (Elem e) =
let attr = (attrValue "id" e, [], [])
return $ linkWith attr href title ils'

"disp-formula" -> formula displayMath
"inline-formula" -> formula math
"alternatives" -> if hasFormulaChild e
then inlineFormula math e
else innerInlines id
"disp-formula" -> inlineFormula displayMath e
"inline-formula" -> inlineFormula math e
"math" | qURI (elName e) == Just "http://www.w3.org/1998/Math/MathML"
-> return . math $ mathML e
"tex-math" -> return . math $ textContent e
Expand All @@ -616,24 +623,44 @@ parseInline (Elem e) =
_ -> innerInlines id
where innerInlines f = extractSpaces f . mconcat <$>
mapM parseInline (elContent e)
mathML x =
case readMathML . showElement $ everywhere (mkT removePrefix) x of
Left _ -> mempty
Right m -> writeTeX m
formula constructor = do
codeWithLang = do
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e

inlineFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Inlines
inlineFormula constructor e = do
let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
texMaths = map textContent $
filterChildren (named "tex-math") whereToLook
mathMLs = map mathML $
filterChildren isMathML whereToLook
return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs

isMathML x = qName (elName x) == "math" &&
blockFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Blocks
blockFormula constructor e = do
let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
texMaths = map textContent $
filterChildren (named "tex-math") whereToLook
mathMLs = map mathML $
filterChildren isMathML whereToLook
return . para . head . take 1 . map constructor $ texMaths ++ mathMLs

mathML :: Element -> Text
mathML x =
case readMathML . showElement $ everywhere (mkT removePrefix) x of
Left _ -> mempty
Right m -> writeTeX m
where removePrefix elname = elname { qPrefix = Nothing }

isMathML :: Element -> Bool
isMathML x = qName (elName x) == "math" &&
qURI (elName x) ==
Just "http://www.w3.org/1998/Math/MathML"
removePrefix elname = elname { qPrefix = Nothing }
codeWithLang = do
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e

formulaChildren :: Element -> [Element]
formulaChildren x = filterChildren isMathML x ++ filterChildren (named "tex-math") x

hasFormulaChild :: Element -> Bool
hasFormulaChild x = length (formulaChildren x) > 0
1 change: 0 additions & 1 deletion test/Tests/Readers/JATS.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ jats = purely $ readJATS def
tests :: [TestTree]
tests = [ testGroup "inline code"
[ test jats "basic" $ "<p>\n <monospace>@&amp;</monospace>\n</p>" =?> para (code "@&")
, test jats "lang" $ "<p>\n <code language=\"c\">@&amp;</code>\n</p>" =?> para (codeWith ("", ["c"], []) "@&")
]
, testGroup "block code"
[ test jats "basic" $ "<preformat>@&amp;</preformat>" =?> codeBlock "@&"
Expand Down
Loading

0 comments on commit 6673f83

Please sign in to comment.