fix: HTML headings that span across multiple lines are now matched

test: Added HTML tests
AlphaJack · Mar 17, 2024 · 1f0d11b
1 parent 886b791
commit 1f0d11b
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 6 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -216,6 +216,8 @@ Steps for a new release:
 5. Run `toc -lf .tocfiles`
 6. Remove tag with `git tag --delete v2.6.0`
 7. Add changelog changes with `git add CHANGELOG.md && git commit -m "minor: updated CHANGELOG.md"`
-8. Move tag to the new commit with `git tag -fa  v2.6.0`
+8. Move tag to the new commit with `git tag -fa v2.6.0`
 9. Upload the new commits and tags with `git push --follow-tags`
 10. Update [AUR](https://aur.archlinux.org/packages/toc) version once the new [PyPI](https://pypi.org/project/tableofcontents/) version is online
+
+If a tag has been pushed to GitHub but the release failed, run `git push --delete origin v2.6.0` and repeat the steps above.
diff --git a/tests/input/html_tags.html b/tests/input/html_tags.html
@@ -0,0 +1,25 @@
+<!doctype html>
+<!-- simple -->
+<h1> 
+
+<br/>
+
+Title
+
+
+   </h1>
+
+<!-- https://docs.python.org/3/library/re.html -->
+<h2>Common Problems<a class="headerlink" href="#common-problems" title="Link to this heading">¶</a></h2>
+
+<!-- https://www.regular-expressions.info/catastrophic.html -->
+<h2>Quickly Matching a Complete HTML File</h2> <p>Another common situation where catastrophic backtracking
+
+<!-- https://py-pkgs.org/03-how-to-package-a-python -->
+<span id=id2></span><h3><span class=section-number>3.1.1. </span>Developing our code<a class=headerlink href=#developing-our-code title="Permalink to this heading">#</a></h3>
+
+<!-- https://access.redhat.com/documentation/en-us/openshift_container_platform/4.9/html/security_and_compliance/network-bound-disk-encryption-nbde#nbde-tang-server-installation-considerations -->
+                                   <h4 id="nav__tools__red-hat-insights">
+                                        <a data-analytics-level="2" data-analytics-category="Tools" data-analytics-text="Red Hat Insights" href="//www.redhat.com/en/technologies/management/insights">Red Hat Insights
+                                        </a>
+                                    </h4>
diff --git a/tests/reference/html_tags.html b/tests/reference/html_tags.html
@@ -0,0 +1,39 @@
+<!doctype html>
+
+<!--
+// ┌───────────────────────────────────────────────────────────────┐
+// │ Contents of html_tags.html                                    │
+// ├───────────────────────────────────────────────────────────────┘
+// │
+// ├──┐Title
+// │  ├── Common Problems¶
+// │  └──┐Quickly Matching a Complete HTML File
+// │     └──┐3.1.1. Developing our code#
+// │        └── Red Hat Insights
+// │
+// └───────────────────────────────────────────────────────────────
+-->
+<!-- simple -->
+<h1> 
+
+<br/>
+
+Title
+
+
+   </h1>
+
+<!-- https://docs.python.org/3/library/re.html -->
+<h2>Common Problems<a class="headerlink" href="#common-problems" title="Link to this heading">¶</a></h2>
+
+<!-- https://www.regular-expressions.info/catastrophic.html -->
+<h2>Quickly Matching a Complete HTML File</h2> <p>Another common situation where catastrophic backtracking
+
+<!-- https://py-pkgs.org/03-how-to-package-a-python -->
+<span id=id2></span><h3><span class=section-number>3.1.1. </span>Developing our code<a class=headerlink href=#developing-our-code title="Permalink to this heading">#</a></h3>
+
+<!-- https://access.redhat.com/documentation/en-us/openshift_container_platform/4.9/html/security_and_compliance/network-bound-disk-encryption-nbde#nbde-tang-server-installation-considerations -->
+                                   <h4 id="nav__tools__red-hat-insights">
+                                        <a data-analytics-level="2" data-analytics-category="Tools" data-analytics-text="Red Hat Insights" href="//www.redhat.com/en/technologies/management/insights">Red Hat Insights
+                                        </a>
+                                    </h4>
diff --git a/toc/toc.py b/toc/toc.py
@@ -75,7 +75,7 @@ def __init__(
     def set_character(self) -> str:
         # automatically select the comment type from its extension, if not already set
         match self.extension:
-            case "ad" | "adoc" | "asc" | "asciidoc" | "c" | "carbon" | "cc" | "coffee" | "cpp" | "cs" | "css" | "d" | "dart" | "go" | "h" | "hpp" | "htm" | "html" | "hxx" | "java" | "js" | "jsx" | "kt" | "md" | "mdx" | "qmd" | "rmd" | "pas" | "php" | "pp" | "proto" | "qs" | "rs" | "scala" | "sc" | "swift" | "ts" | "typ" | "xml" | "zig":
+            case "ad" | "adoc" | "asc" | "asciidoc" | "c" | "carbon" | "cc" | "coffee" | "cpp" | "cs" | "css" | "cu" | "d" | "dart" | "go" | "h" | "hpp" | "htm" | "html" | "hxx" | "java" | "js" | "jsx" | "kt" | "md" | "mdx" | "qmd" | "rmd" | "pas" | "php" | "pp" | "proto" | "qs" | "rs" | "scala" | "sc" | "swift" | "ts" | "typ" | "xml" | "zig":
                 self.character = "//"
             case "ahk" | "asm" | "beancount" | "cl" | "clj" | "cljs" | "cljc" | "edn" | "fasl" | "ini" | "lisp" | "lsp" | "rkt" | "scm" | "ss":
                 self.character = ";"
@@ -442,11 +442,14 @@ def _process_increasing(self, lines: list, heading_character: str) -> list:
         return _newtoc
 
     # #### HTML
-
-    # every time an html page is parsed with regex, a software engineer dies
     def _process_html(self, data: str) -> list:
         _newtoc = []
-        _pattern = re.compile(r"<h(\d).*?>(?:<.*?>)?(.*?)</.*?h\d", re.MULTILINE)
+        # every time an html page is parsed with regex, a software engineer dies
+        # _pattern = re.compile(r"<h(\d).*?>(?:<.*?>)?(.*?)</.*?h\d", re.MULTILINE)
+        # https://learnbyexample.github.io/python-regex-possessive-quantifier/
+        # _pattern = re.compile(r"<h(\d)(?>.*?)>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</(.*?)h\d>", re.MULTILINE)
+        # _pattern = re.compile(r"<h(\d).*?>.*?>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</.*?h\d>", re.DOTALL)
+        _pattern = re.compile(r"<[hH](\d).*?>.*?>?(?:\s)*(?:<.*?>)?(?:\s)*(.*?)(?:\s)*</[hH]\d>", re.DOTALL)
         # _matches = _pattern.finditer(data)
         # print(sum(1 for _ in _matches))
         # for _match in _matches:
@@ -462,7 +465,8 @@ def _process_html(self, data: str) -> list:
             _heading_level = int(_match.group(1))
             # in case there are fancy tags or other elements inside the title, we remove them
             # blood for the blood god
-            _heading_text = re.sub(r"<.*?>", "", _match.group(2).strip())
+            _heading_text = re.sub(r"<.*?>", "", _match.group(2)).strip()
+            # print(f"Heading text: '{_heading_text}'")
             if self.lineNumbers:
                 # return the character number, not the line number
                 _untilCurrentMatch = _match.start(0)