Skip to content

Commit

Permalink
fix: HTML headings that span across multiple lines are now matched
Browse files Browse the repository at this point in the history
test: Added HTML tests
  • Loading branch information
AlphaJack committed Mar 17, 2024
1 parent 886b791 commit 1f0d11b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 6 deletions.
4 changes: 3 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ Steps for a new release:
5. Run `toc -lf .tocfiles`
6. Remove tag with `git tag --delete v2.6.0`
7. Add changelog changes with `git add CHANGELOG.md && git commit -m "minor: updated CHANGELOG.md"`
8. Move tag to the new commit with `git tag -fa v2.6.0`
8. Move tag to the new commit with `git tag -fa v2.6.0`
9. Upload the new commits and tags with `git push --follow-tags`
10. Update [AUR](https://aur.archlinux.org/packages/toc) version once the new [PyPI](https://pypi.org/project/tableofcontents/) version is online

If a tag has already been pushed to GitHub but the release failed, run `git push --delete origin v2.6.0` to remove the remote tag, then repeat the steps above.
25 changes: 25 additions & 0 deletions tests/input/html_tags.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<!doctype html>
<!-- simple -->
<h1>

<br/>

Title


</h1>

<!-- https://docs.python.org/3/library/re.html -->
<h2>Common Problems<a class="headerlink" href="#common-problems" title="Link to this heading">¶</a></h2>

<!-- https://www.regular-expressions.info/catastrophic.html -->
<h2>Quickly Matching a Complete HTML File</h2> <p>Another common situation where catastrophic backtracking

<!-- https://py-pkgs.org/03-how-to-package-a-python -->
<span id=id2></span><h3><span class=section-number>3.1.1. </span>Developing our code<a class=headerlink href=#developing-our-code title="Permalink to this heading">#</a></h3>

<!-- https://access.redhat.com/documentation/en-us/openshift_container_platform/4.9/html/security_and_compliance/network-bound-disk-encryption-nbde#nbde-tang-server-installation-considerations -->
<h4 id="nav__tools__red-hat-insights">
<a data-analytics-level="2" data-analytics-category="Tools" data-analytics-text="Red Hat Insights" href="//www.redhat.com/en/technologies/management/insights">Red Hat Insights
</a>
</h4>
39 changes: 39 additions & 0 deletions tests/reference/html_tags.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<!doctype html>

<!--
// ┌───────────────────────────────────────────────────────────────┐
// │ Contents of html_tags.html │
// ├───────────────────────────────────────────────────────────────┘
// │
// ├──┐Title
// │ ├── Common Problems¶
// │ └──┐Quickly Matching a Complete HTML File
// │ └──┐3.1.1. Developing our code#
// │ └── Red Hat Insights
// │
// └───────────────────────────────────────────────────────────────
-->
<!-- simple -->
<h1>

<br/>

Title


</h1>

<!-- https://docs.python.org/3/library/re.html -->
<h2>Common Problems<a class="headerlink" href="#common-problems" title="Link to this heading">¶</a></h2>

<!-- https://www.regular-expressions.info/catastrophic.html -->
<h2>Quickly Matching a Complete HTML File</h2> <p>Another common situation where catastrophic backtracking

<!-- https://py-pkgs.org/03-how-to-package-a-python -->
<span id=id2></span><h3><span class=section-number>3.1.1. </span>Developing our code<a class=headerlink href=#developing-our-code title="Permalink to this heading">#</a></h3>

<!-- https://access.redhat.com/documentation/en-us/openshift_container_platform/4.9/html/security_and_compliance/network-bound-disk-encryption-nbde#nbde-tang-server-installation-considerations -->
<h4 id="nav__tools__red-hat-insights">
<a data-analytics-level="2" data-analytics-category="Tools" data-analytics-text="Red Hat Insights" href="//www.redhat.com/en/technologies/management/insights">Red Hat Insights
</a>
</h4>
14 changes: 9 additions & 5 deletions toc/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(
def set_character(self) -> str:
# automatically select the comment type from its extension, if not already set
match self.extension:
case "ad" | "adoc" | "asc" | "asciidoc" | "c" | "carbon" | "cc" | "coffee" | "cpp" | "cs" | "css" | "d" | "dart" | "go" | "h" | "hpp" | "htm" | "html" | "hxx" | "java" | "js" | "jsx" | "kt" | "md" | "mdx" | "qmd" | "rmd" | "pas" | "php" | "pp" | "proto" | "qs" | "rs" | "scala" | "sc" | "swift" | "ts" | "typ" | "xml" | "zig":
case "ad" | "adoc" | "asc" | "asciidoc" | "c" | "carbon" | "cc" | "coffee" | "cpp" | "cs" | "css" | "cu" | "d" | "dart" | "go" | "h" | "hpp" | "htm" | "html" | "hxx" | "java" | "js" | "jsx" | "kt" | "md" | "mdx" | "qmd" | "rmd" | "pas" | "php" | "pp" | "proto" | "qs" | "rs" | "scala" | "sc" | "swift" | "ts" | "typ" | "xml" | "zig":
self.character = "//"
case "ahk" | "asm" | "beancount" | "cl" | "clj" | "cljs" | "cljc" | "edn" | "fasl" | "ini" | "lisp" | "lsp" | "rkt" | "scm" | "ss":
self.character = ";"
Expand Down Expand Up @@ -442,11 +442,14 @@ def _process_increasing(self, lines: list, heading_character: str) -> list:
return _newtoc

# #### HTML

# every time an html page is parsed with regex, a software engineer dies
def _process_html(self, data: str) -> list:
_newtoc = []
_pattern = re.compile(r"<h(\d).*?>(?:<.*?>)?(.*?)</.*?h\d", re.MULTILINE)
# every time an html page is parsed with regex, a software engineer dies
# _pattern = re.compile(r"<h(\d).*?>(?:<.*?>)?(.*?)</.*?h\d", re.MULTILINE)
# https://learnbyexample.github.io/python-regex-possessive-quantifier/
# _pattern = re.compile(r"<h(\d)(?>.*?)>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</(.*?)h\d>", re.MULTILINE)
# _pattern = re.compile(r"<h(\d).*?>.*?>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</.*?h\d>", re.DOTALL)
_pattern = re.compile(r"<[hH](\d).*?>.*?>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</[hH]\d>", re.DOTALL)
# _matches = _pattern.finditer(data)
# print(sum(1 for _ in _matches))
# for _match in _matches:
Expand All @@ -462,7 +465,8 @@ def _process_html(self, data: str) -> list:
_heading_level = int(_match.group(1))
# in case there are fancy tags or other elements inside the title, we remove them
# blood for the blood god
_heading_text = re.sub(r"<.*?>", "", _match.group(2).strip())
_heading_text = re.sub(r"<.*?>", "", _match.group(2)).strip()
# print(f"Heading text: '{_heading_text}'")
if self.lineNumbers:
# return the character number, not the line number
_untilCurrentMatch = _match.start(0)
Expand Down

0 comments on commit 1f0d11b

Please sign in to comment.