1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
class Sanitize; module Transformers
  # Transformer that strips elements and attributes a config does not allow.
  #
  # An instance is built from a config Hash and invoked through #call with an
  # env Hash. From the env it reads `:node`, `:node_name` and
  # `:is_whitelisted`. The node is expected to respond to the Nokogiri node
  # methods used below (`element?`, `attribute_nodes`, `children`,
  # `add_previous_sibling`, `add_next_sibling`, `unlink`, `document`, `[]=`).
  class CleanElement
    # Attributes that need additional escaping on `<a>` elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
      name
    ]).freeze

    # Attributes that need additional escaping on all elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
      action
      href
      src
    ]).freeze

    # Mapping of original characters to escape sequences for characters that
    # should be escaped in attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_CHARS = {
      ' ' => '%20',
      '"' => '%22'
    }.freeze

    # Regex that matches any single character that needs to be escaped in
    # attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/

    # Builds the transformer from a config Hash.
    #
    # Keys read: `:add_attributes`, `:attributes`, `:elements`, `:protocols`,
    # `:remove_contents` and `:whitespace_elements`. When `:remove_contents`
    # is an Array, contents are removed only for the listed element names;
    # otherwise its truthiness decides whether contents of every removed
    # element are dropped.
    def initialize(config)
      @config = config

      # For faster lookups. The Hash-valued settings fall back to an empty
      # Hash so that a config lacking one of these keys means "nothing
      # configured" instead of raising NoMethodError on nil inside #call.
      @add_attributes          = config[:add_attributes] || {}
      @allowed_elements        = Set.new(config[:elements])
      @attributes              = config[:attributes] || {}
      @protocols               = config[:protocols] || {}
      @remove_all_contents     = false
      @remove_element_contents = Set.new
      @whitespace_elements     = Set.new(config[:whitespace_elements])

      if config[:remove_contents].is_a?(Array)
        @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
      else
        @remove_all_contents = !!config[:remove_contents]
      end
    end

    # Cleans the node described by `env`.
    #
    # Does nothing when `env[:is_whitelisted]` is truthy or the node is not an
    # element. An element whose name is not allowed is unlinked (see
    # #remove_element). An allowed element has its attributes filtered,
    # escaped, and then supplemented with the configured extra attributes.
    def call(env)
      name = env[:node_name]
      node = env[:node]

      return if env[:is_whitelisted] || !node.element?

      # Delete any element that isn't in the config whitelist.
      unless @allowed_elements.include?(name)
        remove_element(node, name)
        return
      end

      remove_disallowed_attributes(node, name)
      escape_unsafe_attributes(node, name)
      add_required_attributes(node, name)
    end

    private

    # Unlinks a disallowed element, keeping its children unless the config
    # asks for this element's contents to be removed.
    def remove_element(node, name)
      # Elements like br, div, p, etc. need to be replaced with whitespace in
      # order to preserve readability.
      if @whitespace_elements.include?(name)
        node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))

        unless node.children.empty?
          node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
        end
      end

      unless @remove_all_contents || @remove_element_contents.include?(name)
        node.children.each { |child| node.add_previous_sibling(child) }
      end

      node.unlink
    end

    # Removes attributes that are not whitelisted for this element, then
    # attributes whose value uses a protocol that is not allowed.
    def remove_disallowed_attributes(node, name)
      attr_whitelist = Set.new((@attributes[name] || []) +
                               (@attributes[:all] || []))

      if attr_whitelist.empty?
        # Delete all attributes from elements with no whitelisted attributes.
        node.attribute_nodes.each(&:unlink)
        return
      end

      allow_data_attributes = attr_whitelist.include?(:data)

      # Delete any attribute that isn't allowed on this element.
      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase
        next if attr_whitelist.include?(attr_name)

        # The attribute isn't explicitly whitelisted.
        if allow_data_attributes && attr_name.start_with?('data-')
          # Arbitrary data attributes are allowed. Verify that the attribute
          # is a valid data attribute.
          attr.unlink unless attr_name =~ REGEX_DATA_ATTR
        else
          # Either the attribute isn't a data attribute, or arbitrary data
          # attributes aren't allowed. Remove the attribute.
          attr.unlink
        end
      end

      remove_disallowed_protocols(node, name)
    end

    # Deletes remaining attributes that use unacceptable protocols and strips
    # surrounding whitespace from the values of those that are kept.
    def remove_disallowed_protocols(node, name)
      return unless @protocols.has_key?(name)

      protocol = @protocols[name]

      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase
        next unless protocol.has_key?(attr_name)

        # A value without a recognizable protocol is kept only when
        # `:relative` is listed for this attribute.
        del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
          !protocol[attr_name].include?(Regexp.last_match(1).downcase)
        else
          !protocol[attr_name].include?(:relative)
        end

        if del
          attr.unlink
        else
          # Leading and trailing whitespace around URLs is ignored at parse
          # time. Stripping it here prevents it from being escaped by the
          # libxml2 workaround applied afterwards.
          attr.value = attr.value.strip
        end
      end
    end

    # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
    # attempt to preserve server-side includes. This can result in XSS since
    # an unescaped double quote can allow an attacker to inject a
    # non-whitelisted attribute.
    #
    # Sanitize works around this by implementing its own escaping for
    # affected attributes, some of which can exist on any element and some
    # of which can only exist on `<a>` elements.
    #
    # The relevant libxml2 code is here:
    # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
    def escape_unsafe_attributes(node, name)
      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase

        if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
            (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))

          attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
        end
      end
    end

    # Sets every attribute configured under `:add_attributes` for this
    # element name on the node.
    def add_required_attributes(node, name)
      if @add_attributes.has_key?(name)
        @add_attributes[name].each { |key, val| node[key] = val }
      end
    end
  end
end; end
|