From: Ryan Grove <ryan@wonko.com>
Date: Sat, 13 Oct 2018 17:47:21 -0700
Subject: feat: Remove useless filtered element content by default
Origin: https://github.com/rgrove/sanitize/commit/faf9a0f432fda3cef29f0f8aad99d4dedf079d67

[Salvatore Bonaccorso: Backport to 4.6.6 for context changes]
---
 lib/sanitize/config/default.rb             |  4 +-
 lib/sanitize/transformers/clean_element.rb |  6 +-
 test/test_clean_comment.rb                 |  6 +-
 test/test_clean_element.rb                 | 64 +++++++++++++++++-----
 test/test_malicious_html.rb                |  6 +-
 test/test_parser.rb                        |  4 +-
 test/test_sanitize.rb                      |  6 +-
 7 files changed, 66 insertions(+), 30 deletions(-)

--- a/lib/sanitize/config/default.rb
+++ b/lib/sanitize/config/default.rb
@@ -69,7 +69,9 @@ class Sanitize
       # If this is an Array or Set of element names, then only the contents of
       # the specified elements (when filtered) will be removed, and the contents
       # of all other filtered elements will be left behind.
-      :remove_contents => false,
+      :remove_contents => %w[
+        iframe noembed noframes noscript script style
+      ],
 
       # Transformers allow you to filter or alter nodes using custom logic. See
       # README.md for details and examples.
--- a/lib/sanitize/transformers/clean_element.rb
+++ b/lib/sanitize/transformers/clean_element.rb
@@ -97,8 +97,10 @@ class Sanitize; module Transformers; cla
         end
       end
 
-      unless @remove_all_contents || @remove_element_contents.include?(name)
-        node.add_previous_sibling(node.children)
+      unless node.children.empty?
+        unless @remove_all_contents || @remove_element_contents.include?(name)
+          node.add_previous_sibling(node.children)
+        end
       end
 
       node.unlink
--- a/test/test_clean_comment.rb
+++ b/test/test_clean_comment.rb
@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanC
 
       # Special case: the comment markup is inside a <script>, which makes it
       # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
+      @s.fragment("<script><!-- comment --></script>").must_equal ''
 
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanC
       @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
       @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
 
-      # Special case: the comment markup is inside a <script>, which makes it
-      # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
-
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
     end
--- a/test/test_clean_element.rb
+++ b/test/test_clean_element.rb
@@ -8,25 +8,22 @@ describe 'Sanitize::Transformers::CleanE
   strings = {
     :basic => {
       :html       => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>',
-
-      :default    => 'Lorem ipsum dolor sit amet .foo { color: #fff; } alert("hello world");',
-      :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet .foo { color: #fff; } alert("hello world");',
-      :basic      => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet .foo { color: #fff; } alert("hello world");',
-      :relaxed    => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> alert("hello world");'
+      :default    => 'Lorem ipsum dolor sit amet  ',
+      :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet  ',
+      :basic      => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet  ',
+      :relaxed    => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> '
     },
 
     :malformed => {
       :html       => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
-
-      :default    => 'Lorem dolor sit amet alert("hello world");',
-      :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
-      :basic      => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
-      :relaxed    => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
+      :default    => 'Lorem dolor sit amet ',
+      :restricted => 'Lorem <strong>dolor</strong> sit amet ',
+      :basic      => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
+      :relaxed    => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet ',
     },
 
     :unclosed => {
       :html       => '<p>a</p><blockquote>b',
-
       :default    => ' a  b ',
       :restricted => ' a  b ',
       :basic      => '<p>a</p><blockquote>b</blockquote>',
@@ -35,7 +32,6 @@ describe 'Sanitize::Transformers::CleanE
 
     :malicious => {
       :html       => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
-
       :default    => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");',
       :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");',
       :basic      => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");',
@@ -171,10 +167,10 @@ describe 'Sanitize::Transformers::CleanE
         .must_equal 'foo bar baz quux'
 
       Sanitize.fragment('<script>alert("<xss>");</script>')
-        .must_equal 'alert("&lt;xss&gt;");'
+        .must_equal ''
 
       Sanitize.fragment('<<script>script>alert("<xss>");</<script>>')
-        .must_equal '&lt;script&gt;alert("&lt;xss&gt;");&lt;/&lt;script&gt;&gt;'
+        .must_equal '&lt;'
 
       Sanitize.fragment('< script <>> alert("<xss>");</script>')
         .must_equal '&lt; script &lt;&gt;&gt; alert("");'
@@ -196,6 +192,46 @@ describe 'Sanitize::Transformers::CleanE
         .must_equal ''
     end
 
+    it 'should escape the content of removed `plaintext` elements' do
+      Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
+        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
+    end
+
+    it 'should escape the content of removed `xmp` elements' do
+      Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
+        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
+    end
+
+    it 'should not preserve the content of removed `iframe` elements' do
+      Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `noembed` elements' do
+      Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `noframes` elements' do
+      Sanitize.fragment('<noframes>hello! <script>alert(0)</script></noframes>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `noscript` elements' do
+      Sanitize.fragment('<noscript>hello! <script>alert(0)</script></noscript>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `script` elements' do
+      Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
+        .must_equal ''
+    end
+
+    it 'should not preserve the content of removed `style` elements' do
+      Sanitize.fragment('<style>hello! <script>alert(0)</script></style>')
+        .must_equal ''
+    end
+
     strings.each do |name, data|
       it "should clean #{name} HTML" do
         Sanitize.fragment(data[:html]).must_equal(data[:default])
--- a/test/test_malicious_html.rb
+++ b/test/test_malicious_html.rb
@@ -65,7 +65,7 @@ describe 'Malicious HTML' do
 
     it 'should not be possible to inject <script> via a malformed <img> tag' do
       @s.fragment('<img """><script>alert("XSS")</script>">').
-        must_equal '<img>alert("XSS")"&gt;'
+        must_equal '<img>"&gt;'
     end
 
     it 'should not be possible to inject protocol-based JS' do
@@ -117,12 +117,12 @@ describe 'Malicious HTML' do
   describe '<script>' do
     it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
       @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
-        must_equal 'alert(1)'
+        must_equal ''
     end
 
     it 'should not be possible to inject <script> via extraneous open brackets' do
       @s.fragment(%[<<script>alert("XSS");//<</script>]).
-        must_equal '&lt;alert("XSS");//&lt;'
+        must_equal '&lt;'
     end
   end
 
--- a/test/test_parser.rb
+++ b/test/test_parser.rb
@@ -19,8 +19,8 @@ describe 'Parser' do
   end
 
   it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
-    Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
-    Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
+    Sanitize.fragment('foo <script>bar').must_equal 'foo '
+    Sanitize.fragment('foo <style>bar').must_equal 'foo '
   end
 
   it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
--- a/test/test_sanitize.rb
+++ b/test/test_sanitize.rb
@@ -25,7 +25,7 @@ describe 'Sanitize' do
 
       it 'should sanitize an HTML document' do
         @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
-          .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
+          .must_equal "<html>Lorem ipsum dolor sit amet </html>\n"
       end
 
       it 'should not modify the input string' do
@@ -42,7 +42,7 @@ describe 'Sanitize' do
     describe '#fragment' do
       it 'should sanitize an HTML fragment' do
         @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
-          .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
+          .must_equal 'Lorem ipsum dolor sit amet '
       end
 
       it 'should not modify the input string' do
@@ -71,7 +71,7 @@ describe 'Sanitize' do
         doc.xpath('/html/body/node()').each {|node| frag << node }
 
         @s.node!(frag)
-        frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
+        frag.to_html.must_equal 'Lorem ipsum dolor sit amet '
       end
 
       describe "when the given node is a document and <html> isn't whitelisted" do
