Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix RST escaping; add whitespace postprocessing for RST and MD #56

Merged
merged 1 commit into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelogs/fragments/56-postprocess.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
minor_changes:
- "Apply postprocessing to RST and MarkDown to avoid generating invalid markup when input contains whitespace at potentially dangerous places (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
bugfixes:
- "Fix RST escaping to correctly handle whitespace other than spaces (https://github.com/ansible-community/antsibull-docs-parser/pull/56)."
16 changes: 12 additions & 4 deletions src/antsibull_docs_parser/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ def format_paragraphs(
par_sep: str = "",
par_empty: str = "",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
*,
postprocess_paragraph: t.Optional[t.Callable[[str], str]] = None,
) -> str:
"""
Apply the formatter to all parts of the given paragraphs, concatenate the results,
Expand All @@ -223,14 +225,20 @@ def format_paragraphs(
if link_provider is None:
link_provider = _DefaultLinkProvider()
result: t.List[str] = []
walker = _FormatWalker(result, formatter, link_provider, current_plugin)
for paragraph in paragraphs:
if result:
result.append(par_sep)
result.append(par_start)
before_len = len(result)

par_result: t.List[str] = []
walker = _FormatWalker(par_result, formatter, link_provider, current_plugin)
dom.walk(paragraph, walker)
if before_len == len(result):
result.append(par_empty)
par = "".join(par_result)
if postprocess_paragraph:
par = postprocess_paragraph(par)
if not par:
par = par_empty
result.append(par)

result.append(par_end)
return "".join(result)
8 changes: 8 additions & 0 deletions src/antsibull_docs_parser/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ def format_return_value(
DEFAULT_FORMATTER = MDFormatter()


def postprocess_md_paragraph(par: str) -> str:
    """
    Normalize the whitespace of one rendered MarkDown paragraph.

    The paragraph as a whole is trimmed first. Every line then has its surrounding
    whitespace removed and its remaining tabs replaced by single spaces; lines that
    end up empty are dropped. The surviving lines are joined with ``\\n``.
    """
    kept = []
    for raw_line in par.strip().splitlines():
        cleaned = raw_line.strip().replace("\t", " ")
        if cleaned:
            kept.append(cleaned)
    return "\n".join(kept)


def to_md(
paragraphs: t.Sequence[dom.Paragraph],
formatter: Formatter = DEFAULT_FORMATTER,
Expand All @@ -120,4 +127,5 @@ def to_md(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_md_paragraph,
)
116 changes: 112 additions & 4 deletions src/antsibull_docs_parser/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
ReStructured Text serialization.
"""

import re
import typing as t

from . import dom
from .format import Formatter, LinkProvider
from .format import format_paragraphs as _format_paragraphs
from .html import _url_escape

# Matches a whitespace character (anything covered by ``\s``, not just a space) at the
# very beginning of a string.
_STARTING_WHITESPACE = re.compile(r"^\s")
# Matches a whitespace character (anything covered by ``\s``, not just a space) at the
# end of a string.
_ENDING_WHITESPACE = re.compile(r"\s$")


def rst_escape(
value: str,
Expand All @@ -29,9 +33,11 @@ def rst_escape(
value = value.replace("*", "\\*")
value = value.replace("`", "\\`")

if escape_ending_whitespace and value.endswith(" "):
# RST does not like it when the inside of `...` starts or ends with a whitespace
# (here, all kind of whitespaces count, not just spaces...)
if escape_ending_whitespace and _ENDING_WHITESPACE.match(value[-1:]):
value = value + "\\ "
if escape_ending_whitespace and value.startswith(" "):
if escape_ending_whitespace and _STARTING_WHITESPACE.match(value):
value = "\\ " + value
if not value and must_not_be_empty:
value = "\\ "
Expand Down Expand Up @@ -269,6 +275,106 @@ def format_return_value(
DEFAULT_ANTSIBULL_FORMATTER = AntsibullRSTFormatter()
DEFAULT_PLAIN_FORMATTER = PlainRSTFormatter()

# Two or more consecutive '\ ' sequences; they are collapsed into a single '\ '.
_BACKSLASH_SPACE_REPEAT = re.compile("\\\\ (?:\\\\ )+")
# A '\ ' that directly follows a regular space (one not preceded by a backslash) and is
# not followed by a backtick; the '\ ' is dropped and the space is kept.
_BACKSLASH_SPACE_REMOVER_PRE = re.compile("(?<![\\\\])([ ])\\\\ (?![`])")
# A '\ ' that is directly followed by a space or a period; the '\ ' is dropped. It is left
# alone directly after ':`', and also directly after a backslash: in that case the
# backslash of the apparent '\ ' may be the second half of an escaped literal backslash
# ('\\'), and removing it would corrupt that escape.
_BACKSLASH_SPACE_REMOVER_POST = re.compile("(?<!:`)(?<![\\\\])\\\\ ([ .])")


def _count_trailing_backslashes(line: str, start: int, end: int) -> int:
    """Count the consecutive backslashes at the end of ``line[start:end]``."""
    count = 0
    while end - count > start and line[end - count - 1] == "\\":
        count += 1
    return count


def _remove_backslash_space(line: str) -> str:
    """
    Strip superfluous whitespace and '\\ ' escapes from a single line of RST.

    Leading and trailing runs of regular spaces and '\\ ' sequences are removed, as is
    a lone trailing backslash. Inside the line, repeated '\\ ' sequences are collapsed
    and '\\ ' sequences adjacent to a regular space (or followed by a period) are
    dropped.

    Escaped literal backslashes ('\\\\' in the markup) are never split: whether a
    trailing backslash is an escape character or the second half of an escaped
    backslash is decided by the parity of the backslash run it belongs to.

    :param line: One line of RST markup. Callers are expected to have replaced tabs
        by spaces already; only regular spaces are treated as whitespace here.
    :return: The cleaned-up line.
    """
    start = 0
    end = len(line)

    while True:
        # Remove starting '\ '. These have no effect.
        while line.startswith(r"\ ", start, end):
            start += 2

        # If the line now starts with regular whitespace, trim it.
        if line.startswith(" ", start, end):
            start += 1
        else:
            # If there is none, we're done.
            break

        # Remove more leading whitespace, and then check again for leading '\ ' etc.
        while line.startswith(" ", start, end):
            start += 1

    while True:
        # Remove trailing '\ ' resp. '\' (after line.strip()). These actually have an effect,
        # since they remove the linebreak. *But* if our markup generator emits '\ ' followed
        # by a line break, we still want the line break to count, so this is actually fixing
        # a bug.
        #
        # Only an odd-length run of backslashes ends in an unescaped backslash; an
        # even-length run consists solely of escaped literal backslashes and must be kept.
        if _count_trailing_backslashes(line, start, end) % 2:
            end -= 1
        # A trailing space is part of a '\ ' only if the backslash run in front of it has
        # odd length; otherwise it is a regular space following escaped backslashes.
        while (
            line.endswith(" ", start, end)
            and _count_trailing_backslashes(line, start, end - 1) % 2
        ):
            end -= 2

        # If the line now ends with regular whitespace, trim it.
        if line.endswith(" ", start, end):
            end -= 1
        else:
            # If there is none, we're done.
            break

        # Remove more ending whitespace, and then check again for trailing '\' etc.
        while line.endswith(" ", start, end):
            end -= 1

    # Return subset of the line
    line = line[start:end]
    line = _BACKSLASH_SPACE_REPEAT.sub("\\\\ ", line)
    line = _BACKSLASH_SPACE_REMOVER_POST.sub("\\1", line)
    line = _BACKSLASH_SPACE_REMOVER_PRE.sub("\\1", line)
    return line


def _check_line(index: int, lines: t.List[str], line: str) -> bool:
    """
    Tell whether ``lines[index]`` exists and is equal to ``line``.

    An index outside of ``lines`` (negative indices included) yields ``False``
    instead of raising or wrapping around to the end of the list.
    """
    return 0 <= index < len(lines) and lines[index] == line


def _modify_line(index: int, line: str, lines: t.List[str]) -> bool:
    """
    Decide whether the line at ``index`` may be cleaned up or dropped.

    ``postprocess_rst_paragraph`` normalizes (and removes, if empty) only the lines
    for which this returns ``True``. ``False`` is returned for lines that must be kept
    verbatim:

    * the ``.. raw:: html`` line and the dashes line themselves;
    * the ``<hr>`` line if the line two positions earlier is ``.. raw:: html``;
    * an empty line directly before or after ``.. raw:: html``, or three lines
      after it;
    * an empty line directly before or after the dashes line.

    NOTE(review): these literals presumably mirror the horizontal-line markup
    emitted by the RST formatters - confirm against the formatter code.

    :param index: Position of ``line`` inside ``lines``.
    :param line: The line to classify.
    :param lines: All lines of the paragraph, used to inspect neighbouring lines.
    """
    raw_html = ".. raw:: html"
    dashes = "------------"
    hr = " <hr>"

    if line == raw_html or line == dashes:
        return False

    if line == hr:
        return not _check_line(index - 2, lines, raw_html)

    if line == "":
        near_raw_html = any(
            _check_line(index + offset, lines, raw_html) for offset in (1, -1, -3)
        )
        near_dashes = any(
            _check_line(index + offset, lines, dashes) for offset in (1, -1)
        )
        return not (near_raw_html or near_dashes)

    # Any other line is regular content.
    return True


def postprocess_rst_paragraph(par: str) -> str:
    """
    Normalize the whitespace of one rendered RST paragraph.

    The paragraph is trimmed and split into lines. Every line that ``_modify_line``
    allows to be touched is stripped, has its tabs replaced by spaces, and is passed
    through ``_remove_backslash_space``; all other lines are kept verbatim.
    Afterwards, empty lines are dropped unless ``_modify_line`` (evaluated on the
    processed lines) says they must stay. The result is joined with ``\\n``.
    """
    original = par.strip().splitlines()

    # First pass: clean up each modifiable line. Neighbour checks look at the
    # original, unprocessed lines.
    processed = []
    for position, text in enumerate(original):
        if _modify_line(position, text, original):
            text = _remove_backslash_space(text.strip().replace("\t", " "))
        processed.append(text)

    # Second pass: drop empty lines that are not protected. Neighbour checks now
    # look at the processed lines.
    kept = []
    for position, text in enumerate(processed):
        if text or not _modify_line(position, text, processed):
            kept.append(text)
    return "\n".join(kept)


def to_rst(
paragraphs: t.Sequence[dom.Paragraph],
Expand All @@ -277,7 +383,7 @@ def to_rst(
par_start: str = "",
par_end: str = "",
par_sep: str = "\n\n",
par_empty: str = r"\ ",
par_empty: str = "\\",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
) -> str:
return _format_paragraphs(
Expand All @@ -289,6 +395,7 @@ def to_rst(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_rst_paragraph,
)


Expand All @@ -299,7 +406,7 @@ def to_rst_plain(
par_start: str = "",
par_end: str = "",
par_sep: str = "\n\n",
par_empty: str = r"\ ",
par_empty: str = "\\",
current_plugin: t.Optional[dom.PluginIdentifier] = None,
) -> str:
return _format_paragraphs(
Expand All @@ -311,4 +418,5 @@ def to_rst_plain(
par_sep=par_sep,
par_empty=par_empty,
current_plugin=current_plugin,
postprocess_paragraph=postprocess_rst_paragraph,
)
Loading