diff --git a/docs/changelog.md b/docs/changelog.md index 86b3b5fd..4c308899 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [unreleased] +### Changed + +#### Refactor `abbr` Extension + +A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated +`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, +avoiding a conflict between the two extensions (#1460). + +The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which +better reflects what it is. `AbbrPreprocessor` has been deprecated. + +A call to `Markdown.reset()` now clears all previously defined abbreviations. + ### Fixed * Fixed links to source code on GitHub from the documentation (#1453). diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 1c7185b2..1f81cab3 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -25,31 +25,94 @@ from . import Extension from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor -from ..util import AtomicString +from ..treeprocessors import Treeprocessor +from ..util import AtomicString, deprecated +from typing import TYPE_CHECKING import re import xml.etree.ElementTree as etree +if TYPE_CHECKING: # pragma: no cover + from .. import Markdown + from ..blockparsers import BlockParser + class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ - def extendMarkdown(self, md): - """ Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """ - md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16) + def __init__(self, **kwargs): + """ Initiate Extension and set up configs. """ + super().__init__(**kwargs) + self.abbrs = {} + def reset(self): + """ Clear all previously defined abbreviations. 
""" + self.abbrs.clear() -class AbbrPreprocessor(BlockProcessor): - """ Abbreviation Preprocessor - parse text for abbr references. """ + def extendMarkdown(self, md): + """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """ + md.registerExtension(self) + md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7) + md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16) + + +class AbbrTreeprocessor(Treeprocessor): + """ Replace abbreviation text with `` elements. """ + + def __init__(self, md: Markdown | None = None, abbrs: dict | None = None): + self.abbrs: dict = abbrs if abbrs is not None else {} + self.RE: re.RegexObject | None = None + super().__init__(md) + + def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None: + ''' Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags. ''' + for child in reversed(el): + self.iter_element(child, el) + if text := el.text: + for m in reversed(list(self.RE.finditer(text))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = text[m.end():] + el.insert(0, abbr) + text = text[:m.start()] + el.text = text + if parent and el.tail: + tail = el.tail + index = list(parent).index(el) + 1 + for m in reversed(list(self.RE.finditer(tail))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = tail[m.end():] + parent.insert(index, abbr) + tail = tail[:m.start()] + el.tail = tail + + def run(self, root: etree.Element) -> etree.Element | None: + ''' Step through tree to find known abbreviations. ''' + if not self.abbrs: + # No abbreviations defined. Skip running processor. 
+ return + # Build and compile regex + self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b") + # Step through tree and modify on matches + self.iter_element(root) + + +class AbbrBlockprocessor(BlockProcessor): + """ Parse text for abbreviation references. """ RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE) + def __init__(self, parser: BlockParser, abbrs: dict): + self.abbrs: dict = abbrs + super().__init__(parser) + def test(self, parent: etree.Element, block: str) -> bool: return True def run(self, parent: etree.Element, blocks: list[str]) -> bool: """ - Find and remove all Abbreviation references from the text. - Each reference is set as a new `AbbrPattern` in the markdown instance. + Find and remove all abbreviation references from the text. + Each reference is added to the abbreviation collection. """ block = blocks.pop(0) @@ -57,9 +120,7 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if m: abbr = m.group('abbr').strip() title = m.group('title').strip() - self.parser.md.inlinePatterns.register( - AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2 - ) + self.abbrs[abbr] = title if block[m.end():].strip(): # Add any content after match back to blocks as separate block blocks.insert(0, block[m.end():].lstrip('\n')) @@ -71,11 +132,11 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: blocks.insert(0, block) return False - def _generate_pattern(self, text: str) -> str: - """ Given a string, returns a regex pattern to match that string. """ - return f"(?P<abbr>\\b{ re.escape(text) }\\b)" + +AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor) +@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.") class AbbrInlineProcessor(InlineProcessor): """ Abbreviation inline pattern.
""" diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py index e11e8d30..012e5718 100644 --- a/tests/test_syntax/extensions/test_abbr.py +++ b/tests/test_syntax/extensions/test_abbr.py @@ -21,6 +21,8 @@ """ from markdown.test_tools import TestCase +from markdown import Markdown +from markdown.extensions.abbr import AbbrExtension class TestAbbr(TestCase): @@ -60,7 +62,7 @@ def test_abbr_lower(self): ) ) - def test_abbr_multiple(self): + def test_abbr_multiple_in_text(self): self.assertMarkdownRenders( self.dedent( """ @@ -79,6 +81,44 @@ def test_abbr_multiple(self): ) ) + def test_abbr_multiple_in_tail(self): + self.assertMarkdownRenders( + self.dedent( + """ + *The* HTML specification + is maintained by the W3C. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification + is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p> + """ + ) + ) + + def test_abbr_multiple_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ + The *HTML* specification + is maintained by the *W3C*. 
+ + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification + is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p> + """ + ) + ) + def test_abbr_override(self): self.assertMarkdownRenders( self.dedent( @@ -325,3 +365,32 @@ def test_abbr_bracket(self): """ ) ) + + def test_abbr_with_attr_list(self): + self.assertMarkdownRenders( + self.dedent( + """ + *[abbr]: Abbreviation Definition + + ![Image with abbr in title](abbr.png){title="Image with abbr in title"} + """ + ), + self.dedent( + """ + <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p> + """ + ), + extensions=['abbr', 'attr_list'] + ) + + def test_abbr_reset(self): + ext = AbbrExtension() + md = Markdown(extensions=[ext]) + md.convert('*[abbr]: Abbreviation Definition') + self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition'}) + md.convert('*[ABBR]: Capitalised Abbreviation') + self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition', 'ABBR': 'Capitalised Abbreviation'}) + md.reset() + self.assertEqual(ext.abbrs, {}) + md.convert('*[foo]: Foo Definition') + self.assertEqual(ext.abbrs, {'foo': 'Foo Definition'})