From: Greg Guthe <gguthe@mozilla.com>
Date: Thu, 28 Jan 2021 14:56:24 -0500
Subject: sanitizer: escape HTML comments
Origin: https://github.com/mozilla/bleach/commit/1334134d34397966a7f7cfebd38639e9ba2c680e
Bug-Debian: https://bugs.debian.org/986251
Bug: https://bugzilla.mozilla.org/show_bug.cgi?id=1689399
Bug: https://github.com/mozilla/bleach/security/advisories/GHSA-vv2x-vrpj-qqpq
Bug-Debian-Security: https://security-tracker.debian.org/tracker/CVE-2021-23980

fixes: bug 1689399 / GHSA vv2x-vrpj-qqpq
---
 bleach/html5lib_shim.py |  1 +
 bleach/sanitizer.py     |  4 ++++
 tests/test_clean.py     | 47 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -48,6 +48,7 @@ from html5lib._inputstream import (
     HTMLInputStream,
 )  # noqa: E402 module level import not at top of file
 from html5lib.serializer import (
+    escape,
     HTMLSerializer,
 )  # noqa: E402 module level import not at top of file
 from html5lib._tokenizer import (
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -376,6 +376,10 @@ class BleachSanitizerFilter(html5lib_shi
 
         elif token_type == "Comment":
             if not self.strip_html_comments:
+                # call lxml.sax.saxutils to escape &, <, and > in addition to " and '
+                token["data"] = html5lib_shim.escape(
+                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
+                )
                 return token
             else:
                 return None
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -766,6 +766,53 @@ def test_namespace_rc_data_element_strip
     )
 
 
+@pytest.mark.parametrize(
+    "namespace_tag, end_tag, data, expected",
+    [
+        (
+            "math",
+            "p",
+            "<math><style><!--</style><img src/onerror=alert(1)>",
+            "<math><style><!--&lt;/style&gt;&lt;img src/onerror=alert(1)&gt;--></style></math>",
+        ),
+        (
+            "math",
+            "br",
+            "<math></br><style><!--</style><img src/onerror=alert(1)>",
+            "<math><style><!--&lt;/style&gt;&lt;img src/onerror=alert(1)&gt;--></style></math>",
+        ),
+        (
+            "svg",
+            "p",
+            "<svg></p><style><!--</style><img src/onerror=alert(1)>",
+            "<svg><p></p><style><!--&lt;/style&gt;&lt;img src/onerror=alert(1)&gt;--></style></svg>",
+        ),
+        (
+            "svg",
+            "br",
+            "<svg></br><style><!--</style><img src/onerror=alert(1)>",
+            "<svg><br><style><!--&lt;/style&gt;&lt;img src/onerror=alert(1)&gt;--></style></svg>",
+        ),
+    ],
+)
+def test_html_comments_escaped(namespace_tag, end_tag, data, expected):
+    # refs: bug 1689399 / GHSA-vv2x-vrpj-qqpq
+    #
+    # p and br can be just an end tag (e.g. </p> == <p></p>)
+    #
+    # In browsers:
+    #
+    # * img and other tags break out of the svg or math namespace (e.g. <svg><img></svg> == <svg><img></svg>)
+    # * style does not (e.g. <svg><style></svg> == <svg><style></style></svg>)
+    # * the breaking tag ejects trailing elements (e.g. <svg><img><style></style></svg> == <svg></svg><img><style></style>)
+    #
+    # the ejected elements can trigger XSS
+    assert (
+        clean(data, tags=[namespace_tag, end_tag, "style"], strip_comments=False)
+        == expected
+    )
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory