Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abbr Extension: Definition Sorting and Glossary storage #1467

Merged
merged 7 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,24 @@ better reflects what it is. `AbbrPreprocessor` has been deprecated.

A call to `Markdown.reset()` now clears all previously defined abbreviations.

Abbreviations are now sorted by length (longest first) before executing `AbbrTreeprocessor`
to ensure that multi-word abbreviations are applied even if an abbreviation
exists for one of those component words. (#1465)

Empty abbreviations are now skipped by `AbbrTreeprocessor`. This avoids applying
waylan marked this conversation as resolved.
Show resolved Hide resolved
abbr tags to text without a title value. This also allows disabling an
abbreviation, which may be useful for documents that use two terms with
identical abbreviations.

Added an optional `glossary` configuration option to the abbreviations extension.
This provides a simple and efficient way to apply abbreviations to every page.

Added an optional `use_last_abbr` configuration option to the abbreviations
extension. Default (`True`) maintains the existing behavior. `False` causes
the extension to only use the first instance of an abbreviation, rather than
the last.


### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
20 changes: 19 additions & 1 deletion docs/extensions/abbreviations.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
title: Abbreviations Extension

ABBR

*[ABBR]: Abbreviation
*[ABBR]: Override Ignored

Abbreviations
=============

Expand Down Expand Up @@ -46,7 +51,20 @@ Usage
See [Extensions](index.md) for general extension usage. Use `abbr` as the name
of the extension.

This extension does not accept any special configuration options.
The following options are provided to configure the output:

* **`use_last_abbr`**:
waylan marked this conversation as resolved.
Show resolved Hide resolved
`True` to use the last instance of an abbreviation, rather than the first instance.

This is useful when auto-appending glossary files to pages while still wanting the page's
abbreviations to take precedence. Not recommended for use with the `glossary` option.

* **`glossary`**:
Path to a Markdown file containing abbreviations to be applied to every page.

The abbreviations from this file will be the default abbreviations applied to every page, with
abbreviations defined on the page taking precedence (unless also using `use_last_abbr`). The
glossary should use the same Markdown abbreviation syntax described on this page.

A trivial example:

Expand Down
45 changes: 36 additions & 9 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from __future__ import annotations

from . import Extension
from ..util import parseBoolValue
nbanyan marked this conversation as resolved.
Show resolved Hide resolved
from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..treeprocessors import Treeprocessor
Expand All @@ -41,18 +42,41 @@ class AbbrExtension(Extension):

def __init__(self, **kwargs):
    """ Initiate Extension and set up configs. """
    # Each option maps its name to `[default value, help text]`.
    self.config = {
        'glossary': [
            {},
            # The trailing space keeps the two adjacent literals from fusing
            # into "definition.Default: ..." when they are concatenated.
            'A dictionary where the `key` is the abbreviation and the `value` is the definition. '
            "Default: `{}`"
        ],
    }
    """ Default configuration options. """
    super().__init__(**kwargs)
    # Abbreviations currently in effect.
    self.abbrs = {}
    # Default abbreviations; `reset()` copies these back into `abbrs`.
    self.glossary = {}

def reset(self):
    """ Drop every abbreviation collected so far, then re-seed from the glossary. """
    active = self.abbrs
    active.clear()
    defaults = self.glossary
    if not defaults:
        # Nothing to restore when no glossary has been loaded.
        return
    active.update(defaults)

def reset_glossary(self):
    """ Empty the glossary in place; `abbrs` is not touched by this call. """
    stored = self.glossary
    stored.clear()

def load_glossary(self, dictionary: dict[str, str]):
    """
    Merge `dictionary` into the glossary.

    Entries already present in the glossary keep their current definition;
    `dictionary` only supplies abbreviations that are not yet defined. An
    empty (or `None`) `dictionary` leaves the glossary untouched.
    """
    if not dictionary:
        return
    # Start from the incoming entries, then let the existing glossary win on clashes.
    merged = dict(dictionary)
    merged.update(self.glossary)
    self.glossary = merged

def extendMarkdown(self, md):
""" Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
if (self.config['glossary'][0]):
self.load_glossary(self.config['glossary'][0])
self.abbrs.update(self.glossary)
md.registerExtension(self)
md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16)
md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs, self.getConfigs()), 'abbr', 16)
nbanyan marked this conversation as resolved.
Show resolved Hide resolved


class AbbrTreeprocessor(Treeprocessor):
Expand All @@ -69,13 +93,14 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -
self.iter_element(child, el)
if text := el.text:
for m in reversed(list(self.RE.finditer(text))):
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
if self.abbrs[m.group(0)]:
waylan marked this conversation as resolved.
Show resolved Hide resolved
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
el.text = text
if parent and el.tail:
if parent is not None and el.tail:
tail = el.tail
index = list(parent).index(el) + 1
for m in reversed(list(self.RE.finditer(tail))):
Expand All @@ -92,7 +117,9 @@ def run(self, root: etree.Element) -> etree.Element | None:
# No abbreviations defined. Skip running processor.
return
# Build and compile regex
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
abbr_list = list(self.abbrs.keys())
abbr_list.sort(key=len, reverse=True)
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in abbr_list) })\\b")
# Step through tree and modify on matches
self.iter_element(root)

Expand All @@ -102,7 +129,7 @@ class AbbrBlockprocessor(BlockProcessor):

RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)

def __init__(self, parser: BlockParser, abbrs: dict):
def __init__(self, parser: BlockParser, abbrs: dict, config: dict):
nbanyan marked this conversation as resolved.
Show resolved Hide resolved
self.abbrs: dict = abbrs
super().__init__(parser)

Expand Down
102 changes: 97 additions & 5 deletions tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
License: BSD (see LICENSE.md for details).
"""

import os
from markdown.test_tools import TestCase
from markdown import Markdown
from markdown.extensions.abbr import AbbrExtension
Expand Down Expand Up @@ -119,21 +120,67 @@ def test_abbr_multiple_nested(self):
)
)

def test_abbr_override(self):
def test_abbr_glossary(self):
nbanyan marked this conversation as resolved.
Show resolved Hide resolved

glossary = {
"ABBR" : "Abbreviation",
"abbr" : "Abbreviation",
"HTML" : "Hyper Text Markup Language",
"W3C" : "World Wide Web Consortium"
}

self.assertMarkdownRenders(
self.dedent(
"""
ABBR
abbr

*[ABBR]: Ignored
*[ABBR]: The override
HTML
W3C
"""
),
self.dedent(
"""
<p><abbr title="The override">ABBR</abbr></p>
<p><abbr title="Abbreviation">ABBR</abbr>
<abbr title="Abbreviation">abbr</abbr></p>
<p><abbr title="Hyper Text Markup Language">HTML</abbr>
<abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
)
),
extensions=[AbbrExtension(glossary=glossary)]
)

def test_abbr_glossary_2(self):
# Entries passed to `load_glossary()` before the extension is attached must win over
# the `glossary` config option for the same key ("ABBR" below), while the remaining
# config entries still apply.

glossary = {
"ABBR" : "Abbreviation",
"abbr" : "Abbreviation",
"HTML" : "Hyper Text Markup Language",
"W3C" : "World Wide Web Consortium"
}

# Only "ABBR" is redefined here; the expected output uses this definition.
glossary_2 = {
"ABBR" : "New Abbreviation"
}

abbr_ext = AbbrExtension(glossary=glossary)
abbr_ext.load_glossary(glossary_2)

self.assertMarkdownRenders(
self.dedent(
"""
ABBR abbr

HTML W3C
"""
),
self.dedent(
"""
<p><abbr title="New Abbreviation">ABBR</abbr> <abbr title="Abbreviation">abbr</abbr></p>
<p><abbr title="Hyper Text Markup Language">HTML</abbr> <abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
),
extensions=[abbr_ext]
)

def test_abbr_nested(self):
Expand Down Expand Up @@ -383,6 +430,47 @@ def test_abbr_with_attr_list(self):
extensions=['abbr', 'attr_list']
)

def test_abbr_superset_vs_subset(self):
# "abbr-SS" contains both "abbr" and "SS"; each of the three must receive its own
# definition, so the compound term must not be split into its component abbreviations.
self.assertMarkdownRenders(
self.dedent(
"""
abbr, SS, and abbr-SS should have different definitions.

*[abbr]: Abbreviation Definition
*[abbr-SS]: Abbreviation Superset Definition
*[SS]: Superset Definition
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation Definition">abbr</abbr>, """
+ """<abbr title="Superset Definition">SS</abbr>, """
+ """and <abbr title="Abbreviation Superset Definition">abbr-SS</abbr> """
+ """should have different definitions.</p>
"""
)
)

def test_abbr_empty(self):
# A later definition with an empty title (`*[abbr]:`) disables the earlier one:
# the expected output contains plain "abbr" with no <abbr> tag around it.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]: Abbreviation Definition

abbr

*[abbr]:

Testing document text.
"""
),
self.dedent(
"""
<p>abbr</p>\n<p>Testing document text.</p>
"""
)
)
waylan marked this conversation as resolved.
Show resolved Hide resolved

def test_abbr_reset(self):
ext = AbbrExtension()
md = Markdown(extensions=[ext])
Expand All @@ -394,3 +482,7 @@ def test_abbr_reset(self):
self.assertEqual(ext.abbrs, {})
md.convert('*[foo]: Foo Definition')
self.assertEqual(ext.abbrs, {'foo': 'Foo Definition'})

import unittest
if __name__ == '__main__':
unittest.main()
Loading