File: clean_element.rb

package info (click to toggle)
ruby-sanitize 2.1.0-2%2Bdeb9u1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 412 kB
  • sloc: ruby: 860; makefile: 4
file content (155 lines) | stat: -rw-r--r-- 5,375 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
class Sanitize; module Transformers

  # Transformer that enforces a config's element, attribute and protocol
  # whitelists on a single node.
  #
  # For each element node it either removes the element (when its name is not
  # in the `:elements` whitelist) or strips every attribute that is not
  # whitelisted or that uses a disallowed protocol, escapes attribute values
  # affected by unsafe libxml2 serialization, and finally sets any attributes
  # configured under `:add_attributes`.
  class CleanElement

    # Attributes that need additional escaping on `<a>` elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
      name
    ]).freeze

    # Attributes that need additional escaping on all elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
      action
      href
      src
    ]).freeze

    # Mapping of original characters to escape sequences for characters that
    # should be escaped in attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_CHARS = {
      ' ' => '%20',
      '"' => '%22'
    }.freeze

    # Regex that matches any single character that needs to be escaped in
    # attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/

    # Builds the transformer from a config hash.
    #
    # config - Hash read with the keys :add_attributes, :attributes,
    #          :elements, :protocols, :remove_contents and
    #          :whitespace_elements. :remove_contents may be an Array of
    #          element names (only those elements lose their contents) or any
    #          other value, whose truthiness applies to every removed element.
    #          Missing :add_attributes, :attributes or :protocols entries are
    #          treated as empty hashes so a partial config does not raise
    #          later in #call.
    def initialize(config)
      @config = config

      # For faster lookups.
      @add_attributes          = config[:add_attributes] || {}
      @allowed_elements        = Set.new(config[:elements])
      @attributes              = config[:attributes] || {}
      @protocols               = config[:protocols] || {}
      @remove_all_contents     = false
      @remove_element_contents = Set.new
      @whitespace_elements     = Set.new(config[:whitespace_elements])

      if config[:remove_contents].is_a?(Array)
        @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
      else
        @remove_all_contents = !!config[:remove_contents]
      end
    end

    # Cleans a single node in place.
    #
    # env - Hash from which :node (the node to clean), :node_name (the name
    #       checked against the whitelists) and :is_whitelisted (truthy to
    #       leave the node untouched) are read.
    #
    # Nodes that are whitelisted or are not elements are skipped. Returns nil
    # in those cases and when the element was removed; otherwise returns
    # whatever the final "add attributes" step evaluates to (nil, or the
    # configured attribute hash for this element). That value is incidental
    # and is kept only so existing callers observe the same result.
    def call(env)
      name = env[:node_name]
      node = env[:node]

      return if env[:is_whitelisted] || !node.element?

      # Delete any element that isn't in the config whitelist.
      unless @allowed_elements.include?(name)
        remove_element(node, name)
        return
      end

      clean_attributes(node, name)
      escape_unsafe_attributes(node, name)
      add_required_attributes(node, name)
    end

    private

    # Unlinks a non-whitelisted element, optionally keeping its children and
    # padding it with whitespace.
    def remove_element(node, name)
      # Elements like br, div, p, etc. need to be replaced with whitespace in
      # order to preserve readability.
      if @whitespace_elements.include?(name)
        node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))

        unless node.children.empty?
          node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
        end
      end

      # Hoist the children in front of the node unless the config asks for
      # the contents of this element to be dropped along with it.
      unless @remove_all_contents || @remove_element_contents.include?(name)
        node.children.each { |child| node.add_previous_sibling(child) }
      end

      node.unlink
    end

    # Removes attributes that are not whitelisted for this element, then
    # removes remaining attributes that use unacceptable protocols.
    def clean_attributes(node, name)
      attr_whitelist = Set.new((@attributes[name] || []) +
          (@attributes[:all] || []))

      if attr_whitelist.empty?
        # Delete all attributes from elements with no whitelisted attributes.
        node.attribute_nodes.each(&:unlink)
        return
      end

      allow_data_attributes = attr_whitelist.include?(:data)

      # Delete any attribute that isn't allowed on this element.
      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase
        next if attr_whitelist.include?(attr_name)

        # The attribute isn't explicitly whitelisted. It may only stay when
        # arbitrary data attributes are allowed and it is a valid one.
        valid_data_attr = allow_data_attributes &&
          attr_name.start_with?('data-') &&
          attr_name =~ REGEX_DATA_ATTR

        attr.unlink unless valid_data_attr
      end

      remove_disallowed_protocols(node, name)
    end

    # Deletes attributes whose value uses a protocol that is not allowed for
    # this element/attribute pair; strips whitespace from the ones that stay.
    def remove_disallowed_protocols(node, name)
      return unless @protocols.key?(name)

      protocol = @protocols[name]

      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase
        next unless protocol.key?(attr_name)

        allowed = protocol[attr_name]

        # A value with no recognizable protocol is treated as a relative URL.
        permitted = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
          allowed.include?(Regexp.last_match(1).downcase)
        else
          allowed.include?(:relative)
        end

        if permitted
          # Leading and trailing whitespace around URLs is ignored at parse
          # time. Stripping it here prevents it from being escaped by the
          # libxml2 workaround applied afterwards.
          attr.value = attr.value.strip
        else
          attr.unlink
        end
      end
    end

    # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
    # attempt to preserve server-side includes. This can result in XSS since
    # an unescaped double quote can allow an attacker to inject a
    # non-whitelisted attribute.
    #
    # Sanitize works around this by implementing its own escaping for
    # affected attributes, some of which can exist on any element and some
    # of which can only exist on `<a>` elements.
    #
    # The relevant libxml2 code is here:
    # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
    def escape_unsafe_attributes(node, name)
      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase

        affected = UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
          (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
        next unless affected

        attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
      end
    end

    # Sets the attributes configured under :add_attributes for this element.
    # Evaluates to nil when nothing is configured for the element.
    def add_required_attributes(node, name)
      if @add_attributes.key?(name)
        @add_attributes[name].each { |key, val| node[key] = val }
      end
    end
  end

end; end