From: Dawa Ometto <dawa.ometto@phil.uu.nl>
Date: Sun, 30 Sep 2018 12:35:42 +0200
Subject: [4/6] * Move libxml2 safety tests to test_sanitize * Only strip
 attribute when it concerns a url * Fix expected test output for image tag
 with empty src
Origin: https://github.com/rgrove/sanitize/commit/56bd15033fa6a9ce720172287e8fd947c62f6b13
Bug-Debian-Security: https://security-tracker.debian.org/tracker/CVE-2018-3740
Bug: https://github.com/rgrove/sanitize/issues/176
Bug-Debian: https://bugs.debian.org/893610

---
 lib/sanitize/transformers/clean_element.rb |  52 +++---
 test/common.rb                             |  36 ----
 test/test_clean_element.rb                 | 190 ---------------------
 test/test_malicious_html.rb                |  81 ---------
 test/test_sanitize.rb                      |  83 ++++++++-
 5 files changed, 109 insertions(+), 333 deletions(-)
 delete mode 100644 test/common.rb
 delete mode 100644 test/test_clean_element.rb
 delete mode 100644 test/test_malicious_html.rb

diff --git a/lib/sanitize/transformers/clean_element.rb b/lib/sanitize/transformers/clean_element.rb
index 07ed27f3b28c..53f5ccda5d1c 100644
--- a/lib/sanitize/transformers/clean_element.rb
+++ b/lib/sanitize/transformers/clean_element.rb
@@ -98,28 +98,6 @@ class Sanitize; module Transformers
               attr.unlink
             end
           end
-        
-          # Leading and trailing whitespace around URLs is ignored at parse
-          # time. Stripping it here prevents it from being escaped by the
-          # libxml2 workaround below.
-          stripped = attr.value.strip
-          attr.value = stripped unless stripped.empty?
-
-          # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
-          # attempt to preserve server-side includes. This can result in XSS since
-          # an unescaped double quote can allow an attacker to inject a
-          # non-whitelisted attribute.
-          #
-          # Sanitize works around this by implementing its own escaping for
-          # affected attributes, some of which can exist on any element and some
-          # of which can only exist on `<a>` elements.
-          #
-          # The relevant libxml2 code is here:
-          # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
-          if !stripped.empty? && (UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
-              (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name)))
-            attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
-          end
         end
 
         # Delete remaining attributes that use unacceptable protocols.
@@ -136,11 +114,37 @@ class Sanitize; module Transformers
               !protocol[attr_name].include?(:relative)
             end
 
-            attr.unlink if del
+            if del
+              attr.unlink
+            else
+              # Leading and trailing whitespace around URLs is ignored at parse
+              # time. Stripping it here prevents it from being escaped by the
+              # libxml2 workaround below.
+              attr.value = attr.value.strip
+            end
           end
         end
       end
 
+      # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
+      # attempt to preserve server-side includes. This can result in XSS since
+      # an unescaped double quote can allow an attacker to inject a
+      # non-whitelisted attribute.
+      #
+      # Sanitize works around this by implementing its own escaping for
+      # affected attributes, some of which can exist on any element and some
+      # of which can only exist on `<a>` elements.
+      #
+      # The relevant libxml2 code is here:
+      # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
+      node.attribute_nodes.each do |attr|
+        attr_name = attr.name.downcase
+        if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
+          (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
+            attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
+        end
+      end
+
       # Add required attributes.
       if @add_attributes.has_key?(name)
         @add_attributes[name].each {|key, val| node[key] = val }
@@ -148,4 +152,4 @@ class Sanitize; module Transformers
     end
   end
 
-end; end
\ No newline at end of file
+end; end
diff --git a/test/common.rb b/test/common.rb
deleted file mode 100644
index 36c4a7b1e251..000000000000
--- a/test/common.rb
+++ /dev/null
@@ -1,36 +0,0 @@
-# encoding: utf-8
-gem 'minitest'
-require 'minitest/autorun'
-require 'rubygems'
-
-
-require_relative '../lib/sanitize'
-
-# Helper to stub an instance method. Shamelessly stolen from
-# https://github.com/codeodor/minitest-stub_any_instance/
-class Object
-  def self.stub_instance(name, value, &block)
-    old_method = "__stubbed_method_#{name}__"
-
-    class_eval do
-      alias_method old_method, name
-
-      define_method(name) do |*args|
-        if value.respond_to?(:call) then
-          value.call(*args)
-        else
-          value
-        end
-      end
-    end
-
-    yield
-
-  ensure
-    class_eval do
-      undef_method name
-      alias_method name, old_method
-      undef_method old_method
-    end
-  end
-end
diff --git a/test/test_clean_element.rb b/test/test_clean_element.rb
deleted file mode 100644
index a50df9a06e9b..000000000000
--- a/test/test_clean_element.rb
+++ /dev/null
@@ -1,190 +0,0 @@
-# encoding: utf-8
-require_relative 'common'
-
-describe 'Sanitize::Transformers::CleanElement' do
-  make_my_diffs_pretty!
-  parallelize_me!
-
-  strings = {
-    :basic => {
-      :html       => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>',
-
-      :default    => 'Lorem ipsum dolor sit amet .foo { color: #fff; } alert("hello world");',
-      :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet .foo { color: #fff; } alert("hello world");',
-      :basic      => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet .foo { color: #fff; } alert("hello world");',
-      :relaxed    => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> alert("hello world");'
-    },
-
-    :malformed => {
-      :html       => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
-
-      :default    => 'Lorem dolor sit amet alert("hello world");',
-      :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
-      :basic      => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
-      :relaxed    => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
-    },
-
-    :unclosed => {
-      :html       => '<p>a</p><blockquote>b',
-
-      :default    => ' a  b ',
-      :restricted => ' a  b ',
-      :basic      => '<p>a</p><blockquote>b</blockquote>',
-      :relaxed    => '<p>a</p><blockquote>b</blockquote>'
-    },
-
-    :malicious => {
-      :html       => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
-
-      :default    => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");',
-      :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");',
-      :basic      => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");',
-      :relaxed    => '<b>Lorem</b> <a title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");'
-    }
-  }
-
-  protocols = {
-    'protocol-based JS injection: simple, no spaces' => {
-      :html       => '<a href="javascript:alert(\'XSS\');">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: simple, spaces before' => {
-      :html       => '<a href="javascript    :alert(\'XSS\');">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: simple, spaces after' => {
-      :html       => '<a href="javascript:    alert(\'XSS\');">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: simple, spaces before and after' => {
-      :html       => '<a href="javascript    :   alert(\'XSS\');">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: preceding colon' => {
-      :html       => '<a href=":javascript:alert(\'XSS\');">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: UTF-8 encoding' => {
-      :html       => '<a href="javascript&#58;">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: long UTF-8 encoding' => {
-      :html       => '<a href="javascript&#0058;">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: long UTF-8 encoding without semicolons' => {
-      :html       => '<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: hex encoding' => {
-      :html       => '<a href="javascript&#x3A;">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: long hex encoding' => {
-      :html       => '<a href="javascript&#x003A;">foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: hex encoding without semicolons' => {
-      :html       => '<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>',
-      :default    => 'foo',
-      :restricted => 'foo',
-      :basic      => '<a rel="nofollow">foo</a>',
-      :relaxed    => '<a>foo</a>'
-    },
-
-    'protocol-based JS injection: null char' => {
-      :html       => "<img src=java\0script:alert(\"XSS\")>",
-      :default    => '',
-      :restricted => '',
-      :basic      => '',
-      :relaxed    => '<img>'
-    },
-
-    'protocol-based JS injection: invalid URL char' => {
-      :html       => '<img src=java\script:alert("XSS")>',
-      :default    => '',
-      :restricted => '',
-      :basic      => '',
-      :relaxed    => '<img>'
-    },
-
-    'protocol-based JS injection: spaces and entities' => {
-      :html       => '<img src=" &#14;  javascript:alert(\'XSS\');">',
-      :default    => '',
-      :restricted => '',
-      :basic      => '',
-      :relaxed    => '<img>'
-    },
-
-    'protocol whitespace' => {
-      :html       => '<a href=" http://example.com/"></a>',
-      :default    => '',
-      :restricted => '',
-      :basic      => '<a href="http://example.com/" rel="nofollow"></a>',
-      :relaxed    => '<a href="http://example.com/"></a>'
-    }
-  }
-
-  describe 'Basic config' do
-    before do
-      @s = Sanitize.new(Sanitize::Config::BASIC)
-    end
-
-    it 'should not choke on valueless attributes' do
-      @s.clean('foo <a href>foo</a> bar')
-        .must_equal 'foo <a href rel="nofollow">foo</a> bar'
-    end
-  end
-
-  describe 'Custom configs' do
-    it "should not allow relative URLs when relative URLs aren't whitelisted" do
-      input = '<a href="/foo/bar">Link</a>'
-
-      Sanitize.clean(input,
-        :elements   => ['a'],
-        :attributes => {'a' => ['href']},
-        :protocols  => {'a' => {'href' => ['http']}}
-      ).must_equal '<a>Link</a>'
-    end
-  end
-end
diff --git a/test/test_malicious_html.rb b/test/test_malicious_html.rb
deleted file mode 100644
index af51235ed653..000000000000
--- a/test/test_malicious_html.rb
+++ /dev/null
@@ -1,81 +0,0 @@
-# encoding: utf-8
-require_relative 'common'
-
-# Miscellaneous attempts to sneak maliciously crafted HTML past Sanitize. Many
-# of these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat
-# Sheet.
-#
-# https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet
-
-describe 'Malicious HTML' do
-  make_my_diffs_pretty!
-  parallelize_me!
-
-  before do
-    @s = Sanitize.new(Sanitize::Config::RELAXED)
-  end
-
-  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
-  # attempt to preserve server-side includes. This can result in XSS since an
-  # unescaped double quote can allow an attacker to inject a non-whitelisted
-  # attribute. Sanitize works around this by implementing its own escaping for
-  # affected attributes.
-  #
-  # The relevant libxml2 code is here:
-  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
-  describe 'unsafe libxml2 server-side includes in attributes' do
-    tag_configs = [
-      {
-        tag_name: 'a',
-        escaped_attrs: %w[ action href src name ],
-        unescaped_attrs: []
-      },
-
-      {
-        tag_name: 'div',
-        escaped_attrs: %w[ action href src ],
-        unescaped_attrs: %w[ name ]
-      }
-    ]
-
-    before do
-      @s = Sanitize.new({
-        elements: %w[ a div ],
-
-        attributes: {
-          all: %w[ action href src name ]
-        }
-      })
-    end
-
-    tag_configs.each do |tag_config|
-      tag_name = tag_config[:tag_name]
-
-      tag_config[:escaped_attrs].each do |attr_name|
-        input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
-
-        it 'should escape unsafe characters in attributes' do
-          @s.clean(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
-        end
-
-        it 'should round-trip to the same output' do
-          output = @s.clean(input)
-          @s.clean(output).must_equal(output)
-        end
-      end
-
-      tag_config[:unescaped_attrs].each do |attr_name|
-        input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
-
-        it 'should not escape characters unnecessarily' do
-          @s.clean(input).must_equal(input)
-        end
-
-        it 'should round-trip to the same output' do
-          output = @s.clean(input)
-          @s.clean(output).must_equal(output)
-        end
-      end
-    end
-  end
-end
diff --git a/test/test_sanitize.rb b/test/test_sanitize.rb
index a0b48a932656..02b25206ebfc 100644
--- a/test/test_sanitize.rb
+++ b/test/test_sanitize.rb
@@ -1,3 +1,4 @@
+
 # encoding: utf-8
 #--
 # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
@@ -21,7 +22,11 @@
 # SOFTWARE.
 #++
 
-require_relative 'common'
+require 'rubygems'
+gem 'minitest'
+
+require 'minitest/autorun'
+require 'sanitize'
 
 strings = {
   :basic => {
@@ -177,7 +182,7 @@ tricky = {
     :default    => '',
     :restricted => '',
     :basic      => '',
-    :relaxed    => '<img src="">'
+    :relaxed    => '<img src>'
   }
 }
 
@@ -641,3 +646,77 @@ describe 'bugs' do
     Sanitize.clean!('foo <style>bar').must_equal('foo bar')
   end
 end
+
+describe 'Malicious HTML' do
+  make_my_diffs_pretty!
+  parallelize_me!
+
+  before do
+    @s = Sanitize.new(Sanitize::Config::RELAXED)
+  end
+
+  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
+  # attempt to preserve server-side includes. This can result in XSS since an
+  # unescaped double quote can allow an attacker to inject a non-whitelisted
+  # attribute. Sanitize works around this by implementing its own escaping for
+  # affected attributes.
+  #
+  # The relevant libxml2 code is here:
+  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
+  describe 'unsafe libxml2 server-side includes in attributes' do
+    tag_configs = [
+      {
+        tag_name: 'a',
+        escaped_attrs: %w[ action href src name ],
+        unescaped_attrs: []
+      },
+
+      {
+        tag_name: 'div',
+        escaped_attrs: %w[ action href src ],
+        unescaped_attrs: %w[ name ]
+      }
+    ]
+
+    before do
+      @s = Sanitize.new({
+        elements: %w[ a div ],
+
+        attributes: {
+          all: %w[ action href src name ]
+        }
+      })
+    end
+
+    tag_configs.each do |tag_config|
+      tag_name = tag_config[:tag_name]
+
+      tag_config[:escaped_attrs].each do |attr_name|
+        input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
+
+        it 'should escape unsafe characters in attributes' do
+          @s.clean(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
+        end
+
+        it 'should round-trip to the same output' do
+          output = @s.clean(input)
+          @s.clean(output).must_equal(output)
+        end
+      end
+
+      tag_config[:unescaped_attrs].each do |attr_name|
+        input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
+
+        it 'should not escape characters unnecessarily' do
+          @s.clean(input).must_equal(input)
+        end
+
+        it 'should round-trip to the same output' do
+          output = @s.clean(input)
+          @s.clean(output).must_equal(output)
+        end
+      end
+    end
+  end
+end
+
-- 
2.20.1

