From: Ryan Grove <ryan@wonko.com>
Date: Mon, 15 Jun 2020 14:27:07 -0700
Subject: Fix sanitization bypass in HTML foreign content
Origin: https://github.com/rgrove/sanitize/commit/a11498de9e283cd457b35ee252983662f7452aa9
Bug: https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
Bug-Debian: https://bugs.debian.org/963808
Bug-Debian-Security: https://security-tracker.debian.org/tracker/CVE-2020-4054

https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m

[Salvatore Bonaccorso: Backport to 4.6.6 for context changes]
---
 README.md                      | 11 +++++++++++
 lib/sanitize/config/default.rb |  2 +-
 test/test_clean_element.rb     | 30 ++++++++++++++++++++----------
 test/test_malicious_html.rb    | 13 +++++++++++++
 4 files changed, 45 insertions(+), 11 deletions(-)

--- a/README.md
+++ b/README.md
@@ -73,6 +73,11 @@ Sanitize can sanitize the following type
 * Standalone CSS stylesheets
 * Standalone CSS properties
 
+However, please note that Sanitize _cannot_ fully sanitize the contents of
+`<math>` or `<svg>` elements, since these elements don't follow the same parsing
+rules as the rest of HTML. If this is something you need, you may want to look
+for another solution.
+
 ### HTML Fragments
 
 A fragment is a snippet of HTML that doesn't contain a root-level `<html>`
@@ -417,6 +422,12 @@ elements not in this array will be remov
 ]
 ```
 
+**Warning:** Sanitize cannot fully sanitize the contents of `<math>` or `<svg>`
+elements, since these elements don't follow the same parsing rules as the rest
+of HTML. If you add `math` or `svg` to the allowlist, you must assume that any
+content inside them will be allowed, even if that content would otherwise be
+removed by Sanitize.
+
 #### :protocols (Hash)
 
 URL protocols to allow in specific attributes. If an attribute is listed here
--- a/lib/sanitize/config/default.rb
+++ b/lib/sanitize/config/default.rb
@@ -70,7 +70,7 @@ class Sanitize
       # the specified elements (when filtered) will be removed, and the contents
       # of all other filtered elements will be left behind.
       :remove_contents => %w[
-        iframe noembed noframes noscript script style
+        iframe math noembed noframes noscript plaintext script style svg xmp
       ],
 
       # Transformers allow you to filter or alter nodes using custom logic. See
--- a/test/test_clean_element.rb
+++ b/test/test_clean_element.rb
@@ -192,21 +192,16 @@ describe 'Sanitize::Transformers::CleanE
         .must_equal ''
     end
 
-    it 'should escape the content of removed `plaintext` elements' do
-      Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
-        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
-    end
-
-    it 'should escape the content of removed `xmp` elements' do
-      Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
-        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
-    end
-
     it 'should not preserve the content of removed `iframe` elements' do
       Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
         .must_equal ''
     end
 
+    it 'should not preserve the content of removed `math` elements' do
+      Sanitize.fragment('<math>hello! <script>alert(0)</script></math>')
+        .must_equal ''
+    end
+
     it 'should not preserve the content of removed `noembed` elements' do
       Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
         .must_equal ''
@@ -222,6 +217,11 @@ describe 'Sanitize::Transformers::CleanE
         .must_equal ''
     end
 
+    it 'should not preserve the content of removed `plaintext` elements' do
+      Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
+        .must_equal ''
+    end
+
     it 'should not preserve the content of removed `script` elements' do
       Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
         .must_equal ''
@@ -232,6 +232,16 @@ describe 'Sanitize::Transformers::CleanE
         .must_equal ''
     end
 
+    it 'should not preserve the content of removed `svg` elements' do
+      Sanitize.fragment('<svg>hello! <script>alert(0)</script></svg>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `xmp` elements' do
+      Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
+        .must_equal ''
+    end
+
     strings.each do |name, data|
       it "should clean #{name} HTML" do
         Sanitize.fragment(data[:html]).must_equal(data[:default])
--- a/test/test_malicious_html.rb
+++ b/test/test_malicious_html.rb
@@ -189,4 +189,17 @@ describe 'Malicious HTML' do
       end
     end
   end
+
+  # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
+  describe 'foreign content bypass in relaxed config' do
+    it 'prevents a sanitization bypass via carefully crafted foreign content' do
+      %w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
+        @s.fragment(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]).
+          must_equal ''
+
+        @s.fragment(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]).
+          must_equal ''
+      end
+    end
+  end
 end
