diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 169c4027..dcd98a1a 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -533,7 +533,18 @@ def next_possible_entity(text):
 
 
 class BleachHTMLSerializer(HTMLSerializer):
-    """HTMLSerializer that undoes & -> &amp; in attributes"""
+    """HTMLSerializer that undoes & -> &amp; in attributes and sets
+    escape_rcdata to True
+    """
+
+    # per the HTMLSerializer.__init__ docstring:
+    #
+    #     Whether to escape characters that need to be
+    #     escaped within normal elements within rcdata elements such as
+    #     style.
+    #
+    escape_rcdata = True
+
     def escape_base_amp(self, stoken):
         """Escapes just bare & in HTML attribute values"""
         # First, undo escaping of &. We need to do this because html5lib's
diff --git a/tests/test_clean.py b/tests/test_clean.py
index f3c00001..cd5360b8 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -5,7 +5,7 @@
 from bleach import clean
 from bleach.html5lib_shim import Filter
 from bleach.sanitizer import Cleaner
-
+from bleach._vendor.html5lib.constants import rcdataElements
 
 def test_clean_idempotent():
     """Make sure that applying the filter twice doesn't change anything."""
@@ -787,7 +787,7 @@ def test_nonexistent_namespace():
         (
             raw_tag,
             "<noscript><%s></noscript><img src=x onerror=alert(1) />" % raw_tag,
-            "<noscript><%s></noscript>&lt;img src=x onerror=alert(1) /&gt;" % raw_tag,
+            "<noscript>&lt;%s&gt;</noscript>&lt;img src=x onerror=alert(1) /&gt;" % raw_tag,
         )
         for raw_tag in _raw_tags
     ],
@@ -797,6 +797,29 @@ def test_noscript_rawtag_(raw_tag, data, expected):
     assert clean(data, tags=["noscript", raw_tag]) == expected
 
 
+@pytest.mark.parametrize(
+    "namespace_tag, rc_data_element_tag, data, expected",
+    [
+        (
+            namespace_tag,
+            rc_data_element_tag,
+            "<%s><%s><img src=x onerror=alert(1)>" % (namespace_tag, rc_data_element_tag),
+            "<%s><%s>&lt;img src=x onerror=alert(1)&gt;</%s></%s>" % (namespace_tag, rc_data_element_tag, rc_data_element_tag, namespace_tag),
+        )
+        for namespace_tag in ["math", "svg"]
+        # https://dev.w3.org/html5/html-author/#rcdata-elements
+        # https://html.spec.whatwg.org/index.html#parsing-html-fragments
+        # in html5lib: 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', and 'noscript'
+        for rc_data_element_tag in rcdataElements
+    ],
+)
+def test_namespace_rc_data_element_strip_false(namespace_tag, rc_data_element_tag, data, expected):
+    # refs: bug 1621692 / GHSA-m6xf-fq7q-8743
+    #
+    # browsers will pull the img out of the namespace and rc data tag resulting in XSS
+    assert clean(data, tags=[namespace_tag, rc_data_element_tag], strip=False) == expected
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory