Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor abbr Extension #1461

Merged
merged 5 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [unreleased]

### Changed

#### Refactor `abbr` Extension

A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
avoiding a conflict between the two extensions (#1460).

The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which
better reflects that it is a block processor rather than a preprocessor. The old
name `AbbrPreprocessor` remains available as a deprecated alias.

A call to `Markdown.reset()` now clears all previously defined abbreviations.

### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
89 changes: 75 additions & 14 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,41 +25,102 @@
from . import Extension
from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..util import AtomicString
from ..treeprocessors import Treeprocessor
from ..util import AtomicString, deprecated
from typing import TYPE_CHECKING
import re
import xml.etree.ElementTree as etree

if TYPE_CHECKING: # pragma: no cover
from .. import Markdown
from ..blockparsers import BlockParser


class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """

def extendMarkdown(self, md):
""" Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
def __init__(self, **kwargs):
    """
    Initiate Extension and set up configs.

    All keyword arguments are passed through unchanged to the parent class.
    """
    super().__init__(**kwargs)
    # Maps abbreviation text to its title. This one dict object is handed to
    # both processors in `extendMarkdown`, so they all see the same definitions.
    self.abbrs = {}

def reset(self):
    """ Clear all previously defined abbreviations. """
    # Empty the dict in place rather than rebinding `self.abbrs`: the processors
    # registered in `extendMarkdown` hold a reference to this same object.
    self.abbrs.clear()

class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
def extendMarkdown(self, md):
    """
    Register this extension and its two processors with `md`.

    `AbbrBlockprocessor` collects abbreviation definitions into `self.abbrs` and
    `AbbrTreeprocessor` later wraps matching text in `<abbr>` elements. Both are
    given the same dict object so definitions found while parsing blocks are
    visible when the tree is processed.
    """
    # Registering the extension itself is what allows `reset()` to be invoked
    # by the `Markdown` instance.
    md.registerExtension(self)

    tree_processor = AbbrTreeprocessor(md, self.abbrs)
    block_processor = AbbrBlockprocessor(md.parser, self.abbrs)

    md.treeprocessors.register(tree_processor, 'abbr', 7)
    md.parser.blockprocessors.register(block_processor, 'abbr', 16)


class AbbrTreeprocessor(Treeprocessor):
    """
    Replace abbreviation text with `<abbr>` elements.

    Walks the element tree and wraps every occurrence of a defined abbreviation
    found in an element's `text` or `tail` in an `<abbr title="...">` element.

    Attributes:
        abbrs: Mapping of abbreviation text to its title.
        RE: Compiled pattern matching any defined abbreviation. It is built by
            `run` and is `None` until `run` has compiled it.
    """

    def __init__(self, md: Markdown | None = None, abbrs: dict | None = None):
        """
        Set up the processor.

        Arguments:
            md: The `Markdown` instance; passed on to the parent class.
            abbrs: Mapping of abbreviation text to title. The object is stored
                as-is (not copied), so entries added to it later are seen by
                `run`. A new empty dict is used when `None` is given.
        """
        self.abbrs: dict = abbrs if abbrs is not None else {}
        self.RE: re.Pattern[str] | None = None
        super().__init__(md)

    def _create_element(self, abbr: str, tail: str) -> etree.Element:
        """ Return an `<abbr>` element for `abbr`, followed by the text `tail`. """
        el = etree.Element('abbr', {'title': self.abbrs[abbr]})
        # Atomic so that the abbreviation text is not processed any further.
        el.text = AtomicString(abbr)
        el.tail = tail
        return el

    def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None:
        """
        Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags.

        Arguments:
            el: The element whose children, `text` and `tail` are processed.
            parent: The parent of `el`, or `None` for the root element. The
                `tail` of `el` is only processed when a parent is given, as the
                new `<abbr>` elements are inserted into the parent.
        """
        # Children are visited last-to-first so that siblings inserted after a
        # child (for matches in its tail) never shift a child not yet visited.
        for child in reversed(el):
            self.iter_element(child, el)

        text = el.text
        # Leave `AtomicString` text alone: it is marked as not to be processed
        # further, and slicing it would silently turn it into a plain string.
        if text and not isinstance(text, AtomicString):
            # Matches are handled right-to-left so earlier offsets stay valid
            # while `text` is truncated and elements are inserted at index 0.
            for m in reversed(list(self.RE.finditer(text))):
                el.insert(0, self._create_element(m.group(0), text[m.end():]))
                text = text[:m.start()]
            el.text = text

        tail = el.tail
        # Compare against `None` explicitly; the truth value of an `Element`
        # reflects whether it has children, not whether it exists.
        if parent is not None and tail and not isinstance(tail, AtomicString):
            index = list(parent).index(el) + 1
            for m in reversed(list(self.RE.finditer(tail))):
                parent.insert(index, self._create_element(m.group(0), tail[m.end():]))
                tail = tail[:m.start()]
            el.tail = tail

    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Step through tree to find known abbreviations.

        The tree is modified in place and `None` is returned.
        """
        # Skip empty keys (an empty alternative would match at every word
        # boundary) and try longer abbreviations first, so that `ABC` cannot
        # shadow `ABC-DEF`; regex alternation takes the first branch that matches.
        keys = sorted((key for key in self.abbrs if key), key=len, reverse=True)
        if not keys:
            # No abbreviations defined. Skip running processor.
            return None
        # Build and compile regex
        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in keys) })\\b")
        # Step through tree and modify on matches
        self.iter_element(root)
        return None


class AbbrBlockprocessor(BlockProcessor):
""" Parse text for abbreviation references. """

RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)

def __init__(self, parser: BlockParser, abbrs: dict):
self.abbrs: dict = abbrs
super().__init__(parser)

def test(self, parent: etree.Element, block: str) -> bool:
return True

def run(self, parent: etree.Element, blocks: list[str]) -> bool:
"""
Find and remove all Abbreviation references from the text.
Each reference is set as a new `AbbrPattern` in the markdown instance.
Find and remove all abbreviation references from the text.
Each reference is added to the abbreviation collection.

"""
block = blocks.pop(0)
m = self.RE.search(block)
if m:
abbr = m.group('abbr').strip()
title = m.group('title').strip()
self.parser.md.inlinePatterns.register(
AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
)
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
Expand All @@ -71,11 +132,11 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
blocks.insert(0, block)
return False

def _generate_pattern(self, text: str) -> str:
""" Given a string, returns a regex pattern to match that string. """
return f"(?P<abbr>\\b{ re.escape(text) }\\b)"

AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor)


@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
class AbbrInlineProcessor(InlineProcessor):
""" Abbreviation inline pattern. """

Expand Down
71 changes: 70 additions & 1 deletion tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"""

from markdown.test_tools import TestCase
from markdown import Markdown
from markdown.extensions.abbr import AbbrExtension


class TestAbbr(TestCase):
Expand Down Expand Up @@ -60,7 +62,7 @@ def test_abbr_lower(self):
)
)

def test_abbr_multiple(self):
def test_abbr_multiple_in_text(self):
self.assertMarkdownRenders(
self.dedent(
"""
Expand All @@ -79,6 +81,44 @@ def test_abbr_multiple(self):
)
)

def test_abbr_multiple_in_tail(self):
self.assertMarkdownRenders(
self.dedent(
"""
*The* HTML specification
is maintained by the W3C.

*[HTML]: Hyper Text Markup Language
*[W3C]: World Wide Web Consortium
"""
),
self.dedent(
"""
<p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
"""
)
)

def test_abbr_multiple_nested(self):
self.assertMarkdownRenders(
self.dedent(
"""
The *HTML* specification
is maintained by the *W3C*.

*[HTML]: Hyper Text Markup Language
*[W3C]: World Wide Web Consortium
"""
),
self.dedent(
"""
<p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
"""
)
)

def test_abbr_override(self):
self.assertMarkdownRenders(
self.dedent(
Expand Down Expand Up @@ -325,3 +365,32 @@ def test_abbr_bracket(self):
"""
)
)

def test_abbr_with_attr_list(self):
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]: Abbreviation Definition

![Image with abbr in title](abbr.png){title="Image with abbr in title"}
"""
),
self.dedent(
"""
<p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
"""
),
extensions=['abbr', 'attr_list']
)

def test_abbr_reset(self):
    """ Definitions accumulate across `convert` calls until `Markdown.reset()` clears them. """
    extension = AbbrExtension()
    converter = Markdown(extensions=[extension])
    expected = {}

    # Without a reset, each conversion adds to the definitions already collected.
    converter.convert('*[abbr]: Abbreviation Definition')
    expected['abbr'] = 'Abbreviation Definition'
    self.assertEqual(extension.abbrs, expected)

    converter.convert('*[ABBR]: Capitalised Abbreviation')
    expected['ABBR'] = 'Capitalised Abbreviation'
    self.assertEqual(extension.abbrs, expected)

    # A reset empties the collection ...
    converter.reset()
    self.assertEqual(extension.abbrs, {})

    # ... and later conversions start from a clean slate.
    converter.convert('*[foo]: Foo Definition')
    self.assertEqual(extension.abbrs, {'foo': 'Foo Definition'})
Loading