diff --git a/README.md b/README.md
index 98027dd..2815dba 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ The add-on's HTML processing is highly configurable. All options can be accessed
## Platform Support
-HTML processing is provided by the Bleach library on all platforms. Additionally, the add-on also utilizes the [`htmllaundry` library](https://github.com/wichert/htmllaundry) which can improve the cleaning results under under some circumstances.
+HTML processing is provided by the Bleach library on all platforms. The add-on can also be configured to use the [`htmllaundry` library](https://github.com/wichert/htmllaundry), which can improve the cleaning results under some circumstances.
`htmllaundry` depends on `lxml` which Anki unfortunately does not ship with. In contrast to the other libraries included in this add-on, `lxml` cannot be easily be packaged for all platforms because it needs to be compiled. For that reason `htmllaundry` support is only available on Windows and Linux right now.
@@ -24,6 +24,10 @@ HTML processing is provided by the Bleach library on all platforms. Additionally
*Cloze Overlapper* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)*
+This add-on was developed on commission for a fellow Anki user. All credit for the original idea goes to them.
+
+I'm always happy to take on new add-on commissions. If you'd like to hire my services to work on an add-on or new feature, please feel free to reach out to me through *ankiglutanimate [αt] gmail . com*.
+
Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html).
This add-on would not not have been possible without the following open-source libraries:
diff --git a/html_cleaner/main.py b/html_cleaner/main.py
index 11f2998..40a98e0 100644
--- a/html_cleaner/main.py
+++ b/html_cleaner/main.py
@@ -46,8 +46,12 @@
keep_styles = ["color", "background", "font-weight", "font-family",
"font-style", "font-size", "text-decoration", "margin-left"]
+# Whether or not to also process HTML with htmllaundry (if available)
+use_html_laundry = False
+
### USER CONFIGURATION END ###
+
import sys
import os
import re
@@ -57,8 +61,8 @@
sys.path.insert(0, os.path.dirname(__file__))
import bleach
-from htmllaundry import cleaners, sanitize
-# Htmllaundry depends on lxml which we cannot ship with this add-on
+
+# Htmllaundry depends on lxml which we cannot ship on all platforms
# If we can't import htmllaundry we will skip using it further down below
try:
from htmllaundry import cleaners, sanitize
@@ -66,6 +70,7 @@
except ImportError:
LAUNDROMAT = False
+
from aqt.qt import *
from aqt.editor import Editor
from anki.hooks import wrap
@@ -74,28 +79,29 @@
# insert linebreak after regex match
brtags = (r"((div|p|br|li|ul|ol|blockquote|tr|"
- "table|thead|tfoot|tbody|h[1-9]|)>)([^\n])")
+ "table|thead|tfoot|tbody|h[1-9])>|
)([^\n])")
def laundryHtml(html):
"""Clean using htmllaundry/lxml"""
# docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html
+
cleaner = cleaners.LaundryCleaner(
allow_tags = keep_tags,
safe_attrs = keep_attrs,
- page_structure = False,
- remove_unknown_tags = False,
- safe_attrs_only = True,
- add_nofollow = False,
+ processing_instructions = True,
+ meta = True,
scripts = True,
- javascript = True,
comments = True,
- style = True,
+ javascript = True,
+ annoying_tags = True,
+ page_structure=False,
+ remove_unknown_tags=False,
+ safe_attrs_only = False,
+ add_nofollow = False,
+ style = False,
links = False,
- meta = False,
- processing_instructions = True,
- frames = False,
- annoying_tags = True)
+ frames = False)
return sanitize(html, cleaner)
@@ -103,6 +109,7 @@ def laundryHtml(html):
def bleachHtml(html):
"""Clean using bleach/html5lib"""
# docs: https://bleach.readthedocs.io/en/latest/clean.html
+
cleaned = bleach.clean(html,
tags = keep_tags,
attributes = keep_attrs,
@@ -115,16 +122,23 @@ def bleachHtml(html):
def cleanHtml(html):
"""Clean HTML with cleaners and custom regexes"""
html = html.replace("\n", " ")
- if LAUNDROMAT:
+ # both bleach and htmllaundry eat "
"...
+ html = html.replace("
", "
")
+
+ if use_html_laundry and LAUNDROMAT:
+ # lxml.clean will munch <br> tags for some reason, even though
+ # they're whitelisted. This is an ugly workaround, but it works.
+ html = html.replace("
", "|||LBR|||").replace("", "|||LBR|||")
html = laundryHtml(html)
- cleaned = bleachHtml(html)
+ html = html.replace("|||LBR|||", "
")
+ html = bleachHtml(html)
# remove empty style attributes, try to pretty-format tags
- cleaned = cleaned.replace('