ansible-community · felixfontein · Aug 23, 2024 · Aug 22, 2024
diff --git a/changelogs/fragments/56-postprocess.yml b/changelogs/fragments/56-postprocess.yml
@@ -0,0 +1,4 @@
+minor_changes:
+  - "Apply postprocessing to RST and MarkDown to avoid generating invalid markup when input contains whitespace at potentially dangerous places (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
+bugfixes:
+  - "Fix RST escaping to handle other whitespace than spaces correctly (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
diff --git a/src/antsibull_docs_parser/format.py b/src/antsibull_docs_parser/format.py
@@ -212,6 +212,8 @@ def format_paragraphs(
     par_sep: str = "",
     par_empty: str = "",
     current_plugin: t.Optional[dom.PluginIdentifier] = None,
+    *,
+    postprocess_paragraph: t.Optional[t.Callable[[str], str]] = None,
 ) -> str:
     """
     Apply the formatter to all parts of the given paragraphs, concatenate the results,
@@ -223,14 +225,20 @@ def format_paragraphs(
     if link_provider is None:
         link_provider = _DefaultLinkProvider()
     result: t.List[str] = []
-    walker = _FormatWalker(result, formatter, link_provider, current_plugin)
     for paragraph in paragraphs:
         if result:
             result.append(par_sep)
         result.append(par_start)
-        before_len = len(result)
+
+        par_result: t.List[str] = []
+        walker = _FormatWalker(par_result, formatter, link_provider, current_plugin)
         dom.walk(paragraph, walker)
-        if before_len == len(result):
-            result.append(par_empty)
+        par = "".join(par_result)
+        if postprocess_paragraph:
+            par = postprocess_paragraph(par)
+        if not par:
+            par = par_empty
+        result.append(par)
+
         result.append(par_end)
     return "".join(result)
diff --git a/src/antsibull_docs_parser/md.py b/src/antsibull_docs_parser/md.py
@@ -101,6 +101,13 @@ def format_return_value(
 DEFAULT_FORMATTER = MDFormatter()
 
 
+def postprocess_md_paragraph(par: str) -> str:
+    lines = par.strip().splitlines()
+    lines = [line.strip().replace("\t", " ") for line in lines]
+    lines = [line for line in lines if line]
+    return "\n".join(lines)
+
+
 def to_md(
     paragraphs: t.Sequence[dom.Paragraph],
     formatter: Formatter = DEFAULT_FORMATTER,
@@ -120,4 +127,5 @@ def to_md(
         par_sep=par_sep,
         par_empty=par_empty,
         current_plugin=current_plugin,
+        postprocess_paragraph=postprocess_md_paragraph,
     )
diff --git a/src/antsibull_docs_parser/rst.py b/src/antsibull_docs_parser/rst.py
@@ -7,13 +7,17 @@
 ReStructured Text serialization.
 """
 
+import re
 import typing as t
 
 from . import dom
 from .format import Formatter, LinkProvider
 from .format import format_paragraphs as _format_paragraphs
 from .html import _url_escape
 
+_STARTING_WHITESPACE = re.compile(r"^\s")
+_ENDING_WHITESPACE = re.compile(r"\s$")
+
 
 def rst_escape(
     value: str,
@@ -29,9 +33,11 @@ def rst_escape(
     value = value.replace("*", "\\*")
     value = value.replace("`", "\\`")
 
-    if escape_ending_whitespace and value.endswith(" "):
+    # RST does not like it when the inside of `...` starts or ends with a whitespace
+    # (here, all kind of whitespaces count, not just spaces...)
+    if escape_ending_whitespace and _ENDING_WHITESPACE.match(value[-1:]):
         value = value + "\\ "
-    if escape_ending_whitespace and value.startswith(" "):
+    if escape_ending_whitespace and _STARTING_WHITESPACE.match(value):
         value = "\\ " + value
     if not value and must_not_be_empty:
         value = "\\ "
@@ -269,6 +275,106 @@ def format_return_value(
 DEFAULT_ANTSIBULL_FORMATTER = AntsibullRSTFormatter()
 DEFAULT_PLAIN_FORMATTER = PlainRSTFormatter()
 
+_BACKSLASH_SPACE_REPEAT = re.compile("\\\\ (?:\\\\ )+")
+_BACKSLASH_SPACE_REMOVER_PRE = re.compile("(?<![\\\\])([ ])\\\\ (?![`])")
+_BACKSLASH_SPACE_REMOVER_POST = re.compile("(?<!:`)\\\\ ([ .])")
+
+
+def _remove_backslash_space(line: str) -> str:
+    start = 0
+    end = len(line)
+
+    while True:
+        # Remove starting '\ '. These have no effect.
+        while line.startswith(r"\ ", start, end):
+            start += 2
+
+        # If the line now starts with regular whitespace, trim it.
+        if line.startswith(" ", start, end):
+            start += 1
+        else:
+            # If there is none, we're done.
+            break
+
+        # Remove more leading whitespace, and then check again for leading '\ ' etc.
+        while line.startswith(" ", start, end):
+            start += 1
+
+    while True:
+        # Remove trailing '\ ' resp. '\' (after line.strip()). These actually have an effect,
+        # since they remove the linebreak. *But* if our markup generator emits '\ ' followed
+        # by a line break, we still want the line break to count, so this is actually fixing
+        # a bug.
+        if line.endswith("\\", start, end):
+            end -= 1
+        while line.endswith(r"\ ", start, end):
+            end -= 2
+
+        # If the line now ends with regular whitespace, trim it.
+        if line.endswith(" ", start, end):
+            end -= 1
+        else:
+            # If there is none, we're done.
+            break
+
+        # Remove more ending whitespace, and then check again for trailing '\' etc.
+        while line.endswith(" ", start, end):
+            end -= 1
+
+    # Return subset of the line
+    line = line[start:end]
+    line = _BACKSLASH_SPACE_REPEAT.sub("\\\\ ", line)
+    line = _BACKSLASH_SPACE_REMOVER_POST.sub("\\1", line)
+    line = _BACKSLASH_SPACE_REMOVER_PRE.sub("\\1", line)
+    return line
+
+
+def _check_line(index: int, lines: t.List[str], line: str) -> bool:
+    if index < 0 or index >= len(lines):
+        return False
+    return lines[index] == line
+
+
+def _modify_line(index: int, line: str, lines: t.List[str]) -> bool:
+    raw_html = ".. raw:: html"
+    dashes = "------------"
+    hr = "  <hr>"
+    if line not in ("", raw_html, dashes, hr):
+        return True
+    if line in (raw_html, dashes):
+        return False
+    if line == hr and _check_line(index - 2, lines, raw_html):
+        return False
+    if line == "" and (
+        _check_line(index + 1, lines, raw_html)
+        or _check_line(index - 1, lines, raw_html)
+        or _check_line(index - 3, lines, raw_html)
+    ):
+        return False
+    if line == "" and (
+        _check_line(index + 1, lines, dashes) or _check_line(index - 1, lines, dashes)
+    ):
+        return False
+    return True
+
+
+def postprocess_rst_paragraph(par: str) -> str:
+    lines = par.strip().splitlines()
+    lines = [
+        (
+            _remove_backslash_space(line.strip().replace("\t", " "))
+            if _modify_line(index, line, lines)
+            else line
+        )
+        for index, line in enumerate(lines)
+    ]
+    lines = [
+        line
+        for index, line in enumerate(lines)
+        if line or not _modify_line(index, line, lines)
+    ]
+    return "\n".join(lines)
+
 
 def to_rst(
     paragraphs: t.Sequence[dom.Paragraph],
@@ -277,7 +383,7 @@ def to_rst(
     par_start: str = "",
     par_end: str = "",
     par_sep: str = "\n\n",
-    par_empty: str = r"\ ",
+    par_empty: str = "\\",
     current_plugin: t.Optional[dom.PluginIdentifier] = None,
 ) -> str:
     return _format_paragraphs(
@@ -289,6 +395,7 @@ def to_rst(
         par_sep=par_sep,
         par_empty=par_empty,
         current_plugin=current_plugin,
+        postprocess_paragraph=postprocess_rst_paragraph,
     )
 
 
@@ -299,7 +406,7 @@ def to_rst_plain(
     par_start: str = "",
     par_end: str = "",
     par_sep: str = "\n\n",
-    par_empty: str = r"\ ",
+    par_empty: str = "\\",
     current_plugin: t.Optional[dom.PluginIdentifier] = None,
 ) -> str:
     return _format_paragraphs(
@@ -311,4 +418,5 @@ def to_rst_plain(
         par_sep=par_sep,
         par_empty=par_empty,
         current_plugin=current_plugin,
+        postprocess_paragraph=postprocess_rst_paragraph,
     )