From c03724b550ab4735d39a8d6cd159631550e1e73c Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 24 Apr 2024 11:39:10 -0400 Subject: [PATCH 1/5] Refactor `abbr` Extension A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions. Fixes #1460. --- docs/changelog.md | 8 +++ markdown/extensions/abbr.py | 75 ++++++++++++++++++----- tests/test_syntax/extensions/test_abbr.py | 57 ++++++++++++++++- 3 files changed, 124 insertions(+), 16 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 86b3b5fd..25476435 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [unreleased] +### Changed + +#### Refactor `abbr` Extension + +A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated +`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, +avoiding a conflict between the two extensions (#1460). + ### Fixed * Fixed links to source code on GitHub from the documentation (#1453). diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 1c7185b2..6dbfcf51 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -25,7 +25,8 @@ from . import Extension from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor -from ..util import AtomicString +from ..treeprocessors import Treeprocessor +from ..util import AtomicString, deprecated import re import xml.etree.ElementTree as etree @@ -34,22 +35,71 @@ class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ def extendMarkdown(self, md): - """ Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """ - md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16) - - -class AbbrPreprocessor(BlockProcessor): - """ Abbreviation Preprocessor - parse text for abbr references. """ + """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """ + treeprocessor = AbbrTreeprocessor(md) + md.treeprocessors.register(treeprocessor, 'abbr', 7) + md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, treeprocessor.abbrs), 'abbr', 16) + + +class AbbrTreeprocessor(Treeprocessor): + """ Replace abbr text with `` elements. """ + + def __init__(self, md: Markdown | None=None): + self.abbrs = {} + self.RE = None + super().__init__(md) + + def iter_element(self, el, parent=None): + ''' Resursively iterate over elements, run regex on text and wrap matches in `abbr` tags. ''' + for child in reversed(el): + self.iter_element(child, el) + if text := el.text: + for m in reversed(list(self.RE.finditer(text))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = text[m.end():] + el.insert(0, abbr) + text = text[:m.start()] + el.text = text + if parent and el.tail: + tail = el.tail + index = list(parent).index(el) + 1 + for m in reversed(list(self.RE.finditer(tail))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = tail[m.end():] + parent.insert(index, abbr) + tail = tail[:m.start()] + el.tail = tail + + def run(self, root: etree.Element) -> etree.Element | None: + ''' Step through tree to find known abbreviations. ''' + if not self.abbrs: + # No abbrs defined. Skip running processor. + return + # Build and compile regex + self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs.keys()) })\\b") + # Step through tree and modify on matches + self.iter_element(root) + return + + +class AbbrBlockprocessor(BlockProcessor): + """ Abbreviation Blockprocessor - parse text for abbr references. """ RE = re.compile(r'^[*]\[(?P[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P.*)$', re.MULTILINE) + def __init__(self, parser, abbrs): + self.abbrs = abbrs + super().__init__(parser) + def test(self, parent: etree.Element, block: str) -> bool: return True def run(self, parent: etree.Element, blocks: list[str]) -> bool: """ Find and remove all Abbreviation references from the text. - Each reference is set as a new `AbbrPattern` in the markdown instance. + Each reference is added to the abbrs collection. """ block = blocks.pop(0) @@ -57,9 +107,7 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if m: abbr = m.group('abbr').strip() title = m.group('title').strip() - self.parser.md.inlinePatterns.register( - AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2 - ) + self.abbrs[abbr] = title if block[m.end():].strip(): # Add any content after match back to blocks as separate block blocks.insert(0, block[m.end():].lstrip('\n')) @@ -71,11 +119,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: blocks.insert(0, block) return False - def _generate_pattern(self, text: str) -> str: - """ Given a string, returns a regex pattern to match that string. """ - return f"(?P<abbr>\\b{ re.escape(text) }\\b)" - +@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.") class AbbrInlineProcessor(InlineProcessor): """ Abbreviation inline pattern. """ diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py index e11e8d30..61021368 100644 --- a/tests/test_syntax/extensions/test_abbr.py +++ b/tests/test_syntax/extensions/test_abbr.py @@ -60,7 +60,7 @@ def test_abbr_lower(self): ) ) - def test_abbr_multiple(self): + def test_abbr_multiple_in_text(self): self.assertMarkdownRenders( self.dedent( """ @@ -79,6 +79,44 @@ def test_abbr_multiple(self): ) ) + def test_abbr_multiple_in_tail(self): + self.assertMarkdownRenders( + self.dedent( + """ + *The* HTML specification + is maintained by the W3C. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification + is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p> + """ + ) + ) + + def test_abbr_multiple_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ + The *HTML* specification + is maintained by the *W3C*. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification + is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p> + """ + ) + ) + def test_abbr_override(self): self.assertMarkdownRenders( self.dedent( @@ -325,3 +363,20 @@ def test_abbr_bracket(self): """ ) ) + + def test_abbr_with_attr_list(self): + self.assertMarkdownRenders( + self.dedent( + """ + *[abbr]: Abbreviation Definition + + ![Image with abbr in title](abbr.png){title="Image with abbr in title"} + """ + ), + self.dedent( + """ + <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p> + """ + ), + extensions = ['abbr', 'attr_list'] + ) From 3110977699f4f229089f24dafcd7bd142a651318 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 24 Apr 2024 13:47:59 -0400 Subject: [PATCH 2/5] cleanup --- markdown/extensions/abbr.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 6dbfcf51..e8b827a8 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -27,9 +27,14 @@ from ..inlinepatterns import InlineProcessor from ..treeprocessors import Treeprocessor from ..util import AtomicString, deprecated +from typing import TYPE_CHECKING import re import xml.etree.ElementTree as etree +if TYPE_CHECKING: # pragma: no cover + from .. import Markdown + from ..blockparsers import BlockParser + class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ @@ -42,15 +47,15 @@ def extendMarkdown(self, md): class AbbrTreeprocessor(Treeprocessor): - """ Replace abbr text with `<abbr>` elements. """ + """ Replace abbreviation text with `<abbr>` elements. """ - def __init__(self, md: Markdown | None=None): + def __init__(self, md: Markdown | None = None): self.abbrs = {} self.RE = None super().__init__(md) - def iter_element(self, el, parent=None): - ''' Resursively iterate over elements, run regex on text and wrap matches in `abbr` tags. ''' + def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None: + ''' Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags. ''' for child in reversed(el): self.iter_element(child, el) if text := el.text: @@ -89,7 +94,7 @@ class AbbrBlockprocessor(BlockProcessor): RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE) - def __init__(self, parser, abbrs): + def __init__(self, parser: BlockParser, abbrs: dict): self.abbrs = abbrs super().__init__(parser) @@ -98,8 +103,8 @@ def test(self, parent: etree.Element, block: str) -> bool: def run(self, parent: etree.Element, blocks: list[str]) -> bool: """ - Find and remove all Abbreviation references from the text. - Each reference is added to the abbrs collection. + Find and remove all abbreviation references from the text. + Each reference is added to the abbreviation collection. """ block = blocks.pop(0) From 7d9d4d2e7307eabc0a5ebee2caa8c15ff0efed6a Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 24 Apr 2024 14:37:23 -0400 Subject: [PATCH 3/5] reset --- docs/changelog.md | 2 ++ markdown/extensions/abbr.py | 27 +++++++++++++++-------- tests/test_syntax/extensions/test_abbr.py | 16 +++++++++++++- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 25476435..0802167c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -18,6 +18,8 @@ A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions (#1460). +A call to `Markdown.reset()` now clears all previously defined abbreviations. + ### Fixed * Fixed links to source code on GitHub from the documentation (#1453). diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index e8b827a8..ef157625 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -39,19 +39,28 @@ class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ + def __init__(self, **kwargs): + """ Initiate Extension and set up configs. """ + super().__init__(**kwargs) + self.abbrs = {} + + def reset(self): + """ Clear all previously defined abbreviations. """ + self.abbrs.clear() + def extendMarkdown(self, md): """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """ - treeprocessor = AbbrTreeprocessor(md) - md.treeprocessors.register(treeprocessor, 'abbr', 7) - md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, treeprocessor.abbrs), 'abbr', 16) + md.registerExtension(self) + md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7) + md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16) class AbbrTreeprocessor(Treeprocessor): """ Replace abbreviation text with `<abbr>` elements. """ - def __init__(self, md: Markdown | None = None): - self.abbrs = {} - self.RE = None + def __init__(self, md: Markdown | None = None, abbrs: dict | None = None): + self.abbrs: dict = abbrs if abbrs is not None else {} + self.RE: re.RegexObject | None = None super().__init__(md) def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None: @@ -80,7 +89,7 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) - def run(self, root: etree.Element) -> etree.Element | None: ''' Step through tree to find known abbreviations. ''' if not self.abbrs: - # No abbrs defined. Skip running processor. + # No abbreviations defined. Skip running processor. return # Build and compile regex self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs.keys()) })\\b") @@ -90,12 +99,12 @@ def run(self, root: etree.Element) -> etree.Element | None: class AbbrBlockprocessor(BlockProcessor): - """ Abbreviation Blockprocessor - parse text for abbr references. """ + """ Parse text for abbreviation references. """ RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE) def __init__(self, parser: BlockParser, abbrs: dict): - self.abbrs = abbrs + self.abbrs: dict = abbrs super().__init__(parser) def test(self, parent: etree.Element, block: str) -> bool: diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py index 61021368..012e5718 100644 --- a/tests/test_syntax/extensions/test_abbr.py +++ b/tests/test_syntax/extensions/test_abbr.py @@ -21,6 +21,8 @@ """ from markdown.test_tools import TestCase +from markdown import Markdown +from markdown.extensions.abbr import AbbrExtension class TestAbbr(TestCase): @@ -378,5 +380,17 @@ def test_abbr_with_attr_list(self): <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p> """ ), - extensions = ['abbr', 'attr_list'] + extensions=['abbr', 'attr_list'] ) + + def test_abbr_reset(self): + ext = AbbrExtension() + md = Markdown(extensions=[ext]) + md.convert('*[abbr]: Abbreviation Definition') + self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition'}) + md.convert('*[ABBR]: Capitalised Abbreviation') + self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition', 'ABBR': 'Capitalised Abbreviation'}) + md.reset() + self.assertEqual(ext.abbrs, {}) + md.convert('*[foo]: Foo Definition') + self.assertEqual(ext.abbrs, {'foo': 'Foo Definition'}) From 3b757e09bd6b72c940ee474d410f6ce0f589bf4d Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 24 Apr 2024 15:23:56 -0400 Subject: [PATCH 4/5] Note name change AbbrPreprocessor=>AbbrBlockprocessor --- docs/changelog.md | 2 ++ markdown/extensions/abbr.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 0802167c..7a91c8ae 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -18,6 +18,8 @@ A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions (#1460). +The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which better reflects what it is. `AbbrPreprocessor` has been deprecated. + A call to `Markdown.reset()` now clears all previously defined abbreviations. ### Fixed diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index ef157625..8458ade6 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -134,6 +134,9 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: return False +AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor) + + @deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.") class AbbrInlineProcessor(InlineProcessor): """ Abbreviation inline pattern. """ From 684f05f469b96c443f748022e9cfe66ff3d046be Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Thu, 25 Apr 2024 09:41:23 -0400 Subject: [PATCH 5/5] final cleanup --- docs/changelog.md | 3 ++- markdown/extensions/abbr.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 7a91c8ae..4c308899 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -18,7 +18,8 @@ A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions (#1460). -The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which better reflects what it is. `AbbrPreprocessor` has been deprecated. +The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which +better reflects what it is. `AbbrPreprocessor` has been deprecated. A call to `Markdown.reset()` now clears all previously defined abbreviations. diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 8458ade6..1f81cab3 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -92,10 +92,9 @@ def run(self, root: etree.Element) -> etree.Element | None: # No abbreviations defined. Skip running processor. return # Build and compile regex - self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs.keys()) })\\b") + self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b") # Step through tree and modify on matches self.iter_element(root) - return class AbbrBlockprocessor(BlockProcessor):