Skip to content

Commit

Permalink
Fix RST escaping, add whitespace postprocessing for RST and MD.
Browse files Browse the repository at this point in the history
  • Loading branch information
felixfontein committed Aug 23, 2024
1 parent 2d8d836 commit 7342245
Show file tree
Hide file tree
Showing 7 changed files with 760 additions and 488 deletions.
4 changes: 4 additions & 0 deletions changelogs/fragments/56-postprocess.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
minor_changes:
- "Apply postprocessing to RST and MarkDown to avoid generating invalid markup when input contains whitespace at potentially dangerous places (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
bugfixes:
- "Fix RST escaping to handle whitespace other than spaces correctly (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
16 changes: 12 additions & 4 deletions src/antsibull_docs_parser/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ def format_paragraphs(
par_sep: str = "",
par_empty: str = "",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
*,
postprocess_paragraph: t.Optional[t.Callable[[str], str]] = None,
) -> str:
"""
Apply the formatter to all parts of the given paragraphs, concatenate the results,
Expand All @@ -223,14 +225,20 @@ def format_paragraphs(
if link_provider is None:
link_provider = _DefaultLinkProvider()
result: t.List[str] = []
walker = _FormatWalker(result, formatter, link_provider, current_plugin)
for paragraph in paragraphs:
if result:
result.append(par_sep)
result.append(par_start)
before_len = len(result)

par_result: t.List[str] = []
walker = _FormatWalker(par_result, formatter, link_provider, current_plugin)
dom.walk(paragraph, walker)
if before_len == len(result):
result.append(par_empty)
par = "".join(par_result)
if postprocess_paragraph:
par = postprocess_paragraph(par)
if not par:
par = par_empty
result.append(par)

result.append(par_end)
return "".join(result)
8 changes: 8 additions & 0 deletions src/antsibull_docs_parser/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ def format_return_value(
DEFAULT_FORMATTER = MDFormatter()


def postprocess_md_paragraph(par: str) -> str:
    """
    Normalize the whitespace of one rendered MarkDown paragraph.

    The paragraph is stripped and split into lines; every line is stripped as
    well, remaining tabs are turned into single spaces, and lines that end up
    empty are dropped. The surviving lines are joined with ``\\n``.
    """
    cleaned = []
    for raw_line in par.strip().splitlines():
        text = raw_line.strip().replace("\t", " ")
        # Blank lines are discarded entirely.
        if text:
            cleaned.append(text)
    return "\n".join(cleaned)


def to_md(
paragraphs: t.Sequence[dom.Paragraph],
formatter: Formatter = DEFAULT_FORMATTER,
Expand All @@ -120,4 +127,5 @@ def to_md(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_md_paragraph,
)
116 changes: 112 additions & 4 deletions src/antsibull_docs_parser/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
ReStructured Text serialization.
"""

import re
import typing as t

from . import dom
from .format import Formatter, LinkProvider
from .format import format_paragraphs as _format_paragraphs
from .html import _url_escape

# Matches one whitespace character (anything ``\s`` accepts, not only a
# plain space) at the very start of a string.
_STARTING_WHITESPACE = re.compile(r"^\s")
# Matches one whitespace character (anything ``\s`` accepts) at the end of a string.
_ENDING_WHITESPACE = re.compile(r"\s$")


def rst_escape(
value: str,
Expand All @@ -29,9 +33,11 @@ def rst_escape(
value = value.replace("*", "\\*")
value = value.replace("`", "\\`")

if escape_ending_whitespace and value.endswith(" "):
# RST does not like it when the inside of `...` starts or ends with a whitespace
# (here, all kind of whitespaces count, not just spaces...)
if escape_ending_whitespace and _ENDING_WHITESPACE.match(value[-1:]):
value = value + "\\ "
if escape_ending_whitespace and value.startswith(" "):
if escape_ending_whitespace and _STARTING_WHITESPACE.match(value):
value = "\\ " + value
if not value and must_not_be_empty:
value = "\\ "
Expand Down Expand Up @@ -269,6 +275,106 @@ def format_return_value(
DEFAULT_ANTSIBULL_FORMATTER = AntsibullRSTFormatter()
DEFAULT_PLAIN_FORMATTER = PlainRSTFormatter()

# Two or more consecutive '\ ' (backslash followed by space) sequences.
_BACKSLASH_SPACE_REPEAT = re.compile("\\\\ (?:\\\\ )+")
# A '\ ' that directly follows a space which is itself not preceded by a
# backslash, and that is not directly followed by a backtick. The space in
# front of the '\ ' is captured as group 1.
_BACKSLASH_SPACE_REMOVER_PRE = re.compile("(?<![\\\\])([ ])\\\\ (?![`])")
# A '\ ' that is directly followed by a space or a period (captured as
# group 1), unless the '\ ' is directly preceded by ':`'.
_BACKSLASH_SPACE_REMOVER_POST = re.compile("(?<!:`)\\\\ ([ .])")


def _remove_backslash_space(line: str) -> str:
    """
    Strip superfluous ``\\ `` (backslash + space) sequences from a single line.

    The leading run of ``\\ `` sequences and spaces is trimmed, then the
    trailing run of ``\\``, ``\\ `` and spaces is trimmed. On what is left,
    the three module-level ``_BACKSLASH_SPACE_*`` patterns are applied in the
    order REPEAT, REMOVER_POST, REMOVER_PRE.

    ``start`` and ``end`` are moved instead of slicing repeatedly; the line is
    sliced only once after both trimming loops have finished.
    """
    start = 0
    end = len(line)

    while True:
        # Remove starting '\ '. These have no effect.
        while line.startswith(r"\ ", start, end):
            start += 2

        # If the line now starts with regular whitespace, trim it.
        if line.startswith(" ", start, end):
            start += 1
        else:
            # If there is none, we're done.
            break

        # Remove more leading whitespace, and then check again for leading '\ ' etc.
        while line.startswith(" ", start, end):
            start += 1

    while True:
        # Remove trailing '\ ' resp. '\' (after line.strip()). These actually have an effect,
        # since they remove the linebreak. *But* if our markup generator emits '\ ' followed
        # by a line break, we still want the line break to count, so this is actually fixing
        # a bug.
        if line.endswith("\\", start, end):
            end -= 1
        while line.endswith(r"\ ", start, end):
            end -= 2

        # If the line now ends with regular whitespace, trim it.
        if line.endswith(" ", start, end):
            end -= 1
        else:
            # If there is none, we're done.
            break

        # Remove more ending whitespace, and then check again for trailing '\' etc.
        while line.endswith(" ", start, end):
            end -= 1

    # Return subset of the line
    line = line[start:end]
    # Collapse every run of repeated '\ ' into a single '\ '.
    line = _BACKSLASH_SPACE_REPEAT.sub("\\\\ ", line)
    # Drop the '\ ' and keep only the character captured after it.
    line = _BACKSLASH_SPACE_REMOVER_POST.sub("\\1", line)
    # Drop the '\ ' and keep only the space captured in front of it.
    line = _BACKSLASH_SPACE_REMOVER_PRE.sub("\\1", line)
    return line


def _check_line(index: int, lines: t.List[str], line: str) -> bool:
if index < 0 or index >= len(lines):
return False
return lines[index] == line


def _modify_line(index: int, line: str, lines: t.List[str]) -> bool:
raw_html = ".. raw:: html"
dashes = "------------"
hr = " <hr>"
if line not in ("", raw_html, dashes, hr):
return True
if line in (raw_html, dashes):
return False
if line == hr and _check_line(index - 2, lines, raw_html):
return False
if line == "" and (
_check_line(index + 1, lines, raw_html)
or _check_line(index - 1, lines, raw_html)
or _check_line(index - 3, lines, raw_html)
):
return False
if line == "" and (
_check_line(index + 1, lines, dashes) or _check_line(index - 1, lines, dashes)
):
return False
return True


def postprocess_rst_paragraph(par: str) -> str:
    """
    Clean up the whitespace of one rendered RST paragraph.

    The stripped paragraph is split into lines. Each line that
    ``_modify_line`` allows to be changed is stripped, has its tabs replaced
    by single spaces and is passed through ``_remove_backslash_space``; all
    other lines are kept verbatim. Afterwards, empty lines are removed unless
    ``_modify_line`` reports them as not modifiable. The result is joined with
    ``\\n``.
    """
    source_lines = par.strip().splitlines()

    # First pass: normalize modifiable lines. The decision is made against
    # the untouched input lines.
    normalized = []
    for position, text in enumerate(source_lines):
        if _modify_line(position, text, source_lines):
            text = _remove_backslash_space(text.strip().replace("\t", " "))
        normalized.append(text)

    # Second pass: drop blank lines, except those that must stay. This time
    # the decision is made against the normalized lines.
    kept = []
    for position, text in enumerate(normalized):
        if text or not _modify_line(position, text, normalized):
            kept.append(text)

    return "\n".join(kept)


def to_rst(
paragraphs: t.Sequence[dom.Paragraph],
Expand All @@ -277,7 +383,7 @@ def to_rst(
par_start: str = "",
par_end: str = "",
par_sep: str = "\n\n",
par_empty: str = r"\ ",
par_empty: str = "\\",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
) -> str:
return _format_paragraphs(
Expand All @@ -289,6 +395,7 @@ def to_rst(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_rst_paragraph,
)


Expand All @@ -299,7 +406,7 @@ def to_rst_plain(
par_start: str = "",
par_end: str = "",
par_sep: str = "\n\n",
par_empty: str = r"\ ",
par_empty: str = "\\",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
) -> str:
return _format_paragraphs(
Expand All @@ -311,4 +418,5 @@ def to_rst_plain(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_rst_paragraph,
)
Loading

0 comments on commit 7342245

Please sign in to comment.