1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
# Whitewash: whitelist-based HTML validator for Ruby
# (originally written for Samizdat project)
#
# Copyright (c) 2002-2012, 2016 Dmitry Borodaenko <angdraug@debian.org>
#
# This program is free software.
# You can distribute/modify this program under the terms of
# the GNU General Public License version 3 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0
require 'rbconfig'
require 'nokogiri'
require 'yaml'
# Error type reserved for failures specific to Whitewash. It derives from
# RuntimeError, so a plain +rescue+ (or +rescue RuntimeError+) catches it too.
#
class WhitewashError < RuntimeError
end
# Whitelist-based HTML sanitizer.
#
# A whitelist is a Hash keyed by element name. Each value is either nil or a
# Hash that maps attribute names to matchers (compared with <tt>===</tt>
# against the attribute value). Keys starting with an underscore are reserved:
#
# * <tt>_common</tt> -- attribute matchers applied to every element
# * <tt>_css</tt>    -- collection of permitted CSS property names
#
class Whitewash
  # Directories searched, in order, for the default whitelist file.
  # (These constants are public; they were never affected by +private+.)
  PATH = [ '/etc/ruby-whitewash',
           File.join(RbConfig::CONFIG['datadir'], 'ruby-whitewash'),
           '/usr/local/share/ruby-whitewash/',
           File.expand_path('../data/whitewash/', File.dirname(__FILE__)) ]

  # File name of the default whitelist inside one of the PATH directories.
  WHITELIST = 'whitelist.yaml'

  # Matches a single CSS declaration ("property: value value ...") whose
  # values are plain words/numbers/paths, hex colours or percentages. The
  # first capture group is the property name.
  CSS = %r{
    \A\s*
    ([-a-z0-9]+) : \s*
    (?: (?: [-./a-z0-9]+ | \#[0-9a-f]+ | [0-9]+% ) \s* ) +
    \s*\z
  }xi.freeze

  if RUBY_VERSION >= '2.0' or RUBY_VERSION < '1.9.3'
    # Parse a whitelist from a YAML _string_ and return the resulting object.
    #
    # SECURITY: this deliberately uses the unrestricted YAML loader, so that
    # whitelists may contain arbitrary Ruby objects. Only feed it trusted
    # whitelist files, never user-supplied data.
    #
    def self.load(string)
      # YAML.unsafe_load only exists in newer Psych releases; on older ones
      # plain YAML.load is already the unrestricted loader.
      if YAML.respond_to?(:unsafe_load)
        YAML.unsafe_load(string)
      else
        YAML.load(string)
      end
    end
  else
    # Guards the temporary, process-wide switch of the YAML engine below.
    # It must be a single shared lock: a mutex created per call would not
    # exclude anything.
    YAMLER_LOCK = Mutex.new

    # Parse a whitelist from a YAML _string_ and return the resulting object.
    #
    # Uses Syck to parse the whitelist to work around Psych issue #36 that was
    # present in some versions of Ruby 1.9.3. The previously active engine is
    # restored even if parsing raises.
    #
    def self.load(string)
      YAMLER_LOCK.synchronize do
        previous = YAML::ENGINE.yamler
        YAML::ENGINE.yamler = 'syck'
        begin
          YAML.load(string)
        ensure
          YAML::ENGINE.yamler = previous
        end
      end
    end
  end

  # Load the whitelist file named WHITELIST from the first PATH directory in
  # which it is readable.
  #
  # Raises WhitewashError (a RuntimeError) when no directory contains it.
  #
  def self.default_whitelist
    found = PATH.find { |dir| File.readable?(File.join(dir, WHITELIST)) }
    raise WhitewashError, "Can't find default whitelist" unless found

    Whitewash.load(File.read(File.join(found, WHITELIST)))
  end

  # _whitelist_ is expected to be loaded from xhtml.yaml. When omitted, the
  # default whitelist is located and loaded via Whitewash.default_whitelist.
  #
  def initialize(whitelist = Whitewash.default_whitelist)
    @whitelist = whitelist
  end

  # Kept for backward compatibility. Nothing in this class assigns @xhtml,
  # so this reader returns nil.
  attr_reader :xhtml

  # Check every ';'-separated declaration in _style_ against the CSS pattern
  # and the <tt>_css</tt> entry of _whitelist_.
  #
  # Returns true when _whitelist_ has no <tt>_css</tt> entry (nothing to
  # enforce) or when every declaration is well-formed and names a listed
  # property; returns false otherwise.
  #
  def check_style(whitelist, style)
    css = whitelist['_css']
    return true unless css

    style.to_s.split(';').all? do |declaration|
      match = CSS.match(declaration)
      match ? css.include?(match[1]) : false
    end
  end

  # Compare elements and attributes with the whitelist, modifying _xml_ in
  # place:
  #
  # * an element that is not a whitelist key (or whose name starts with an
  #   underscore) is replaced by its children, after those are sanitized;
  # * a <style> element whose content fails check_style is removed;
  # * an attribute is removed unless its whitelist matcher accepts the value;
  #   a style="" attribute must additionally pass check_style.
  #
  # If a block is given, it is invoked for each element that is kept, after
  # that element's children have been processed.
  #
  def sanitize_element(xml, whitelist = @whitelist, &p)
    # Underscore-prefixed whitelist keys are configuration, not element names.
    if xml.name.start_with?('_') || !whitelist.key?(xml.name)
      xml.element_children.each { |e| sanitize_element(e, whitelist, &p) }
      xml.replace(xml.children)
      return
    end

    # sanitize CSS in <style> elements
    if 'style' == xml.name && !check_style(whitelist, xml.content)
      xml.remove
      return
    end

    # Attribute matchers for this element: the common ones, overridden by
    # element-specific ones. Computed once per element rather than once per
    # attribute, and tolerant of a whitelist without a '_common' entry.
    allowed = (whitelist['_common'] || {}).merge(whitelist[xml.name] || {})

    xml.attribute_nodes.each do |a|
      # A missing matcher is nil, and nil === value is false, so attributes
      # that are not listed are dropped.
      acceptable = allowed[a.name] === a.to_s

      # sanitize CSS in style="" attributes
      acceptable &&= check_style(whitelist, a.value) if 'style' == a.name

      xml.remove_attribute(a.name) unless acceptable
    end

    # recurse
    xml.element_children.each { |e| sanitize_element(e, whitelist, &p) }

    yield xml if block_given?
  end

  # Return sanitized HTML.
  #
  # _html_ is parsed with Nokogiri's HTML parser; the children of the
  # resulting <body> are sanitized and serialized as XHTML. Returns an empty
  # string when the parsed document has no body.
  #
  # If block is supplied, it will be invoked for each Nokogiri::XML::Element
  # in the sanitized HTML.
  #
  def sanitize(html, whitelist = @whitelist, &p)
    document = Nokogiri::HTML(html) { |config| config.noblanks }
    body = document.xpath('//html/body').first
    return '' if body.nil?

    sanitize_element(body, whitelist, &p)
    body.children.to_xhtml
  end
end
|