diff --git a/README.md b/README.md index 98027dd..2815dba 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The add-on's HTML processing is highly configurable. All options can be accessed ## Platform Support -HTML processing is provided by the Bleach library on all platforms. Additionally, the add-on also utilizes the [`htmllaundry` library](https://github.com/wichert/htmllaundry) which can improve the cleaning results under under some circumstances. +HTML processing is provided by the Bleach library on all platforms. The add-on can also be configured to use the [`htmllaundry` library](https://github.com/wichert/htmllaundry) which can improve the cleaning results under some circumstances. `htmllaundry` depends on `lxml` which Anki unfortunately does not ship with. In contrast to the other libraries included in this add-on, `lxml` cannot be easily be packaged for all platforms because it needs to be compiled. For that reason `htmllaundry` support is only available on Windows and Linux right now. @@ -24,6 +24,10 @@ HTML processing is provided by the Bleach library on all platforms. Additionally *Cloze Overlapper* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)* +This add-on was developed on a commission by a fellow Anki user. All credit for the original idea goes to them. + +I'm always happy for new add-on commissions. If you'd like to hire my services to work on an add-on or new feature, please feel free to reach out to me through *ankiglutanimate [αt] gmail . com*. + Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html). 
This add-on would not not have been possible without the following open-source libraries: diff --git a/html_cleaner/main.py b/html_cleaner/main.py index 11f2998..40a98e0 100644 --- a/html_cleaner/main.py +++ b/html_cleaner/main.py @@ -46,8 +46,12 @@ keep_styles = ["color", "background", "font-weight", "font-family", "font-style", "font-size", "text-decoration", "margin-left"] +# Whether or not to also process HTML with htmllaundry (if available) +use_html_laundry = False + ### USER CONFIGURATION END ### + import sys import os import re @@ -57,8 +61,8 @@ sys.path.insert(0, os.path.dirname(__file__)) import bleach -from htmllaundry import cleaners, sanitize -# Htmllaundry depends on lxml which we cannot ship with this add-on + +# Htmllaundry depends on lxml which we cannot ship on all platforms # If we can't import htmllaundry we will skip using it further down below try: from htmllaundry import cleaners, sanitize @@ -66,6 +70,7 @@ except ImportError: LAUNDROMAT = False + from aqt.qt import * from aqt.editor import Editor from anki.hooks import wrap @@ -74,28 +79,29 @@ # insert linebreak after regex match brtags = (r"()([^\n])") + "table|thead|tfoot|tbody|h[1-9])>|
)([^\n])") def laundryHtml(html): """Clean using htmllaundry/lxml""" # docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html + cleaner = cleaners.LaundryCleaner( allow_tags = keep_tags, safe_attrs = keep_attrs, - page_structure = False, - remove_unknown_tags = False, - safe_attrs_only = True, - add_nofollow = False, + processing_instructions = True, + meta = True, scripts = True, - javascript = True, comments = True, - style = True, + javascript = True, + annoying_tags = True, + page_structure=False, + remove_unknown_tags=False, + safe_attrs_only = False, + add_nofollow = False, + style = False, links = False, - meta = False, - processing_instructions = True, - frames = False, - annoying_tags = True) + frames = False) return sanitize(html, cleaner) @@ -103,6 +109,7 @@ def laundryHtml(html): def bleachHtml(html): """Clean using bleach/html5lib""" # docs: https://bleach.readthedocs.io/en/latest/clean.html + cleaned = bleach.clean(html, tags = keep_tags, attributes = keep_attrs, @@ -115,16 +122,23 @@ def bleachHtml(html): def cleanHtml(html): """Clean HTML with cleaners and custom regexes""" html = html.replace("\n", " ") - if LAUNDROMAT: + # both bleach and htmllaundry eat "
"... + html = html.replace("
", "
") + + if use_html_laundry and LAUNDROMAT: + # lxml.clean will munch
tags for some reason, even though + # they're whitelisted. This is an ugly workaround, but it works. + html = html.replace("
", "|||LBR|||").replace("
", "|||LBR|||") html = laundryHtml(html) - cleaned = bleachHtml(html) + html = html.replace("|||LBR|||", "
") + html = bleachHtml(html) # remove empty style attributes, try to pretty-format tags - cleaned = cleaned.replace('

', '
') - cleaned = cleaned.replace(' style=""', '').replace("\n", "") - cleaned = re.sub(brtags, r"\1\n\3", cleaned) + html = html.replace('

', '
') + html = html.replace(' style=""', '') + html = re.sub(brtags, r"\1\n\3", html) - return cleaned + return html def onHtmlClean(self): @@ -151,7 +165,7 @@ def onHtmlClean(self): def onFieldUndo(self): - """Executued on undo toggle""" + """Executed on undo toggle""" if not hasattr(self, "_fieldUndo") or not self._fieldUndo: return n, html = self._fieldUndo diff --git a/tools/test.py b/tools/test.py index 04c9fbf..ece0e12 100644 --- a/tools/test.py +++ b/tools/test.py @@ -8,7 +8,7 @@ import bleach -# Htmllaundry depends on lxml which we cannot ship with this add-on +# Htmllaundry depends on lxml which we cannot ship on all platforms # for that reason we have to check if we can import htmllaundry. # If we can't we will skip using htmllaundry further down below try: @@ -17,11 +17,34 @@ except ImportError: LAUNDROMAT = False - html = u"""
EXAMPLE 1
LOREM DOLOR SIT AMET
EXAMPLE 2
-Lorem ipsum doler sit amet Lorem ipsum ipsum dolore sit amet.""" +Lorem ipsum doler sit amet Lorem ipsum ipsum dolore sit amet. +
EXAMPLE 3
+Überschrift

+
Das ist der erste Beispielsatz an +dieser Stelle.
+

+Das ist der zweite +Beispielsatz

+

+

+

+
Das ist ein +dritter Satz. 
+

+

+
""" # Html tags to preserve @@ -31,6 +54,7 @@ 'dft', 'br', 'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'div', 'u', 'i'] + # Tag attributes to preserve keep_attrs = [ 'style', 'rev', 'prompt', 'color', 'colspan', 'usemap', 'cols', 'accept', 'datetime', 'char', @@ -45,57 +69,75 @@ 'value', 'longdesc', 'headers', 'vspace', 'noshade', 'coords', 'width', 'maxlength', 'cellpadding', 'title', 'dir', 'tabindex'] + # Styles to preserve in the style attribute keep_styles = ["margin-left"] + # insert linebreak after regex match brtags = (r"()([^\n])") + "table|thead|tfoot|tbody|h[1-9])>|
)([^\n])") + +use_html_laundry = False def laundryHtml(html): + """Clean using htmllaundry/lxml""" + # docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html + cleaner = cleaners.LaundryCleaner( - page_structure=False, - remove_unknown_tags=False, allow_tags = keep_tags, safe_attrs = keep_attrs, - safe_attrs_only=True, - add_nofollow=False, - scripts=True, - javascript=True, - comments=True, - style=True, - links=False, - meta=False, - processing_instructions=True, - frames=False, - annoying_tags=True) + processing_instructions = True, + meta = True, + scripts = True, + comments = True, + javascript = True, + annoying_tags = True, + page_structure=False, + remove_unknown_tags=False, + safe_attrs_only = False, + add_nofollow = False, + style = False, + links = False, + frames = False) return sanitize(html, cleaner) def bleachHtml(html): + """Clean using bleach/html5lib""" + # docs: https://bleach.readthedocs.io/en/latest/clean.html + cleaned = bleach.clean(html, tags = keep_tags, attributes = keep_attrs, styles = keep_styles, - strip = True - ) + strip = True) + return cleaned def cleanHtml(html): + """Clean HTML with cleaners and custom regexes""" html = html.replace("\n", " ") - if LAUNDROMAT: + # both bleach and htmllaundry eat "
"... + html = html.replace("
", "
") + + if use_html_laundry and LAUNDROMAT: + # lxml.clean will munch
tags for some reason, even though + # they're whitelisted. This is an ugly workaround, but it works. + html = html.replace("
", "|||LBR|||").replace("
", "|||LBR|||") html = laundryHtml(html) - cleaned = bleachHtml(html) + html = html.replace("|||LBR|||", "
") + html = bleachHtml(html) # remove empty style attributes, try to pretty-format tags - cleaned = cleaned.replace('

', '
') - cleaned = cleaned.replace(' style=""', '').replace("\n", "") - cleaned = re.sub(brtags, r"\1\n\3", cleaned) + html = html.replace('

', '
') + html = html.replace(' style=""', '') + html = re.sub(brtags, r"\1\n\3", html) - return cleaned + return html cleaned = cleanHtml(html)