From 1334134d34397966a7f7cfebd38639e9ba2c680e Mon Sep 17 00:00:00 2001
From: Greg Guthe
Date: Thu, 28 Jan 2021 14:56:24 -0500
Subject: [PATCH 1/2] sanitizer: escape HTML comments
fixes: bug 1689399 / GHSA vv2x-vrpj-qqpq
---
bleach/html5lib_shim.py | 1 +
bleach/sanitizer.py | 4 ++++
tests/test_clean.py | 47 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 52 insertions(+)
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index c71947ee..b886ca50 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -48,6 +48,7 @@
HTMLInputStream,
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
+ escape,
HTMLSerializer,
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index bc66ad2a..0f5b7cc5 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -371,6 +371,10 @@ def sanitize_token(self, token):
elif token_type == "Comment":
if not self.strip_html_comments:
+ # call xml.sax.saxutils.escape to escape &, <, and > in addition to " and '
+ token["data"] = html5lib_shim.escape(
+ token["data"], entities={'"': "&quot;", "'": "&#x27;"}
+ )
return token
else:
return None
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 1cd58df0..7c565750 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -739,6 +739,53 @@ def test_namespace_rc_data_element_strip_false(
)
+@pytest.mark.parametrize(
+ "namespace_tag, end_tag, data, expected",
+ [
+ (
+ "math",
+ "p",
+ "
",
+ "",
+ ),
+ (
+ "math",
+ "br",
+ "