1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
|
#encoding: US-ASCII
require 'cgi'
module Loofah
module HTML5 # :nodoc:
module Scrub
# Characters deleted from URI-valued attribute text before its protocol is
# inspected (see scrub_attributes): the backtick, U+0000 through U+0020,
# U+007F, and U+0080 through U+0101.
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
class << self
# Tells whether +element_name+ is a member of
# WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.
#
# element_name - the tag name to look up.
#
# Returns whatever that collection's #include? returns.
def allowed_element?(element_name)
  permitted_elements = ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2
  permitted_elements.include?(element_name)
end
# Alternative implementation of the html5lib attribute scrubbing algorithm.
#
# Walks every attribute of +node+ and removes the ones judged unsafe, then
# scrubs the style attribute, drops attributes whose value contains nothing
# but whitespace, and finally applies the attribute-escaping workaround.
#
# An attribute is removed when any of the following holds (checked in order):
# * its name (prefixed with "namespace:" when namespaced) is not in
#   WhiteList::ALLOWED_ATTRIBUTES;
# * it is listed in WhiteList::ATTR_VAL_IS_URI and its value names a protocol
#   that is not in WhiteList::ALLOWED_PROTOCOLS;
# * it is an xlink:href on an element in WhiteList::SVG_ALLOW_LOCAL_HREF and
#   its value has a line whose first non-blank character is not '#'.
# Attributes named data-* skip all of these checks.
#
# node - the element whose attributes are scrubbed in place.
#
# Returns the result of force_correct_attribute_escaping!.
def scrub_attributes(node)
  node.attribute_nodes.each do |attribute|
    ns = attribute.namespace
    qualified_name = ns ? "#{ns.prefix}:#{attribute.node_name}" : attribute.node_name

    # data-* attributes are never inspected further by this pass.
    next if qualified_name =~ /\Adata-[\w-]+\z/

    unsafe = !WhiteList::ALLOWED_ATTRIBUTES.include?(qualified_name)

    if !unsafe && WhiteList::ATTR_VAL_IS_URI.include?(qualified_name)
      # Protocol check, taken nearly verbatim from HTML5 sanitization: decode
      # entities, strip control characters and lowercase before looking at the
      # scheme, so obfuscated schemes are still recognised.
      normalized = CGI.unescapeHTML(attribute.value).gsub(CONTROL_CHARACTERS,'').downcase
      if normalized =~ /^[a-z0-9][-+.a-z0-9]*:/
        scheme = normalized.split(WhiteList::PROTOCOL_SEPARATOR)[0]
        unsafe = !WhiteList::ALLOWED_PROTOCOLS.include?(scheme)
      end
    end

    unless unsafe
      # Blank out url(...) references that do not start with '#'. This must
      # happen before the xlink:href test below, which reads the new value.
      if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(qualified_name) && attribute.value
        attribute.value = attribute.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ')
      end

      unsafe = WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
               qualified_name == 'xlink:href' &&
               attribute.value =~ /^\s*[^#\s].*/m
    end

    attribute.remove if unsafe
  end

  scrub_css_attribute(node)

  # Second pass over a fresh attribute list: discard blank attributes.
  node.attribute_nodes.each do |attribute|
    next if attribute.value =~ /[^[:space:]]/

    node.remove_attribute(attribute.name)
  end

  force_correct_attribute_escaping!(node)
end
# Replaces the value of +node+'s style attribute with the output of scrub_css.
# Does nothing when the node has no style attribute.
#
# node - an element exposing #attributes, indexable by attribute name.
#
# Returns the scrubbed CSS string, or nil when there is no style attribute.
def scrub_css_attribute(node)
  style_attr = node.attributes['style']
  return unless style_attr

  style_attr.value = scrub_css(style_attr.value)
end
# Sanitizes a string of inline CSS declarations (lifted nearly verbatim from
# html5lib).
#
# url(...) references are blanked out first. The remaining text must pass two
# whole-string format checks; if either fails, the result is ''. Each
# surviving "property: value" declaration is kept when
# * the lowercased property is in WhiteList::ALLOWED_CSS_PROPERTIES, or
# * the part of the property before its first '-' is in
#   WhiteList::SHORTHAND_CSS_PROPERTIES and every whitespace-separated token
#   of the value is in WhiteList::ALLOWED_CSS_KEYWORDS or matches the
#   colour/measurement pattern, or
# * the property is in WhiteList::ALLOWED_SVG_PROPERTIES
# (the first matching branch decides). Declarations with an empty value are
# dropped.
#
# style - the CSS text; converted with #to_s, so nil is accepted.
#
# Returns a String of "property: value;" entries joined by single spaces.
def scrub_css(style)
  # disallow urls
  css = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet: only a small set of characters/tokens may appear at all ...
  return '' unless css =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  # ... and the text must be a plain sequence of "name: value" declarations.
  return '' unless css =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|$))*\z/

  declarations = css.scan(/([-\w]+)\s*:\s*([^:;]*)/).map do |name, value|
    next if value.empty?

    property = name.downcase
    keep =
      if WhiteList::ALLOWED_CSS_PROPERTIES.include?(property)
        true
      elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(property.split('-').first)
        value.split.all? do |token|
          WhiteList::ALLOWED_CSS_KEYWORDS.include?(token) ||
            token =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
      else
        WhiteList::ALLOWED_SVG_PROPERTIES.include?(property)
      end

    "#{property}: #{value};" if keep
  end

  declarations.compact.join(' ')
end
# Percent-encodes spaces and double quotes in selected attribute values of
# +node+. Only runs when Nokogiri reports that it is using libxml2.
#
# An attribute is rewritten when its name is in
# LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES and either no qualifying tag
# is registered for it in
# LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG or that tag
# equals the node's name. The rewritten value keeps the encoding of the
# original value.
#
# node - the element whose attributes are fixed up in place.
#
# Returns nil when libxml2 is not in use, otherwise the list that was iterated.
def force_correct_attribute_escaping!(node)
  return unless Nokogiri::VersionInfo.instance.libxml2?

  node.attribute_nodes.each do |attribute|
    attr_name = attribute.name
    next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_name)

    required_tag = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_name]
    next if !required_tag.nil? && required_tag != node.name

    original = attribute.value
    escaped = original.gsub(/[ "]/) do |char|
      # Hex-encode each byte of the matched character as %XX.
      '%' + char.unpack('H2' * char.bytesize).join('%').upcase
    end
    attribute.value = escaped.force_encoding(original.encoding)
  end
end
end
end
end
end
|