File: whitewash.rb

package info (click to toggle)
ruby-whitewash 2.1-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 220 kB
  • sloc: ruby: 1,217; makefile: 3
file content (130 lines) | stat: -rw-r--r-- 3,407 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Whitewash: whitelist-based HTML validator for Ruby
# (originally written for Samizdat project)
#
#   Copyright (c) 2002-2012, 2016  Dmitry Borodaenko <angdraug@debian.org>
#
#   This program is free software.
#   You can distribute/modify this program under the terms of
#   the GNU General Public License version 3 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0

require 'rbconfig'
require 'nokogiri'
require 'yaml'

# Exception type reserved for Whitewash-specific failures.
#
# It derives from RuntimeError, so a plain +rescue+ (or an explicit
# <tt>rescue RuntimeError</tt>) catches it as well.
#
class WhitewashError < RuntimeError
end

# Whitelist-based HTML sanitizer.
#
# A whitelist is a Hash keyed by strings, used by this class as follows:
#
# * <tt>'_css'</tt> -- optional list of CSS property names that may appear in
#   style declarations; when absent, style text is not checked at all
# * <tt>'_common'</tt> -- attribute name => pattern, applied to every element
# * any other key -- an allowed element name, mapped to a Hash of additional
#   attribute name => pattern entries (or +nil+ for none)
#
# Patterns are matched against attribute values with <tt>===</tt>, so they
# may be Regexp or String objects.
#
class Whitewash

  # Guards the temporary switch of the process-wide YAML engine performed by
  # Whitewash.load on Ruby 1.9.3. It has to be a single shared object: a
  # mutex created per call would never block another thread.
  #
  LOAD_MUTEX = Mutex.new

  # Parse the YAML document in _string_ and return the resulting object.
  #
  # SECURITY: the document is loaded without class restrictions (presumably
  # because whitelists carry non-basic types such as regular expressions --
  # confirm against the shipped whitelist), which allows arbitrary object
  # instantiation. Only pass whitelist text from a trusted source.
  #
  def Whitewash.load(string)
    if RUBY_VERSION.start_with?('1.9.3')
      # use Syck to parse the whitelist to work around Psych issue #36 that
      # was present in some versions of Ruby 1.9.3
      LOAD_MUTEX.synchronize do
        yamler = YAML::ENGINE.yamler
        begin
          YAML::ENGINE.yamler = 'syck'
          YAML.load(string)
        ensure
          # restore the caller's engine even when parsing raises
          YAML::ENGINE.yamler = yamler
        end
      end
    elsif YAML.respond_to?(:unsafe_load)
      YAML.unsafe_load(string)
    else
      # older Psych/Syck have no unsafe_load; their plain load is unrestricted
      YAML.load(string)
    end
  end

  # Load the whitelist file named WHITELIST from the first directory in PATH
  # that contains a readable copy.
  #
  # Raises WhitewashError (a RuntimeError) when no directory qualifies.
  #
  def Whitewash.default_whitelist
    dir = PATH.find {|d| File.readable?(File.join(d, WHITELIST)) }
    raise WhitewashError, "Can't find default whitelist" unless dir

    File.open(File.join(dir, WHITELIST)) {|f| Whitewash.load(f.read) }
  end

  # _whitelist_ is expected to have the structure described in the class
  # comment; by default it is loaded from whitelist.yaml via
  # Whitewash.default_whitelist.
  #
  def initialize(whitelist = Whitewash.default_whitelist)
    @whitelist = whitelist
  end

  # Kept for backward compatibility: nothing in this class assigns @xhtml, so
  # this reader returns +nil+ unless a subclass or caller sets it.
  #
  attr_reader :xhtml

  # Matches one CSS declaration of the form <tt>property: value</tt> and
  # captures the property name in group 1.
  #
  # A value is a run of word tokens (<tt>[-./a-z0-9]+</tt>), hash tokens
  # (<tt>#</tt> followed by hex digits) and percentages (digits followed by
  # <tt>%</tt>), optionally separated by whitespace. It is spelled out one
  # character unit at a time instead of as nested repetitions
  # (<tt>(token+ \s*)+</tt>): the accepted strings are the same, but the
  # engine can no longer backtrack exponentially on hostile input.
  #
  CSS = %r{
    \A\s*
    ([-a-z0-9]+) : \s*
    (?: [0-9]% | \#[0-9a-f] | [-./a-z0-9] )
    (?: [0-9]% | \#[0-9a-f] | [-./a-z0-9] | \s ) *
    \z
  }xi.freeze

  # Check semicolon-separated CSS declarations in _style_ against the
  # <tt>'_css'</tt> list of _whitelist_.
  #
  # Returns +true+ when the whitelist has no <tt>'_css'</tt> entry, or when
  # every declaration matches CSS and names a listed property (the property
  # name is compared as written, case-sensitively); +false+ otherwise.
  #
  def check_style(whitelist, style)
    properties = whitelist['_css']
    return true unless properties

    style.split(';').all? do |declaration|
      match = CSS.match(declaration)
      !match.nil? && properties.include?(match[1])
    end
  end

  # Compare the element _xml_ and its descendants with _whitelist_, modifying
  # the tree in place:
  #
  # * an element that is not whitelisted is replaced by its (already
  #   sanitized) children
  # * a <style> element whose content fails check_style is removed
  # * attributes are removed unless their value matches the pattern from
  #   <tt>'_common'</tt> merged with the element's own entry; style=""
  #   attributes must additionally pass check_style
  #
  # If a block is given, it is invoked with every element that is kept,
  # children before their parent.
  #
  def sanitize_element(xml, whitelist = @whitelist, &p)
    # names starting with "_" are reserved for whitelist metadata keys such
    # as '_css' and '_common', so they can never denote an allowed element
    if xml.name.start_with?('_') || !whitelist.key?(xml.name)
      xml.element_children.each {|e| sanitize_element(e, whitelist, &p) }
      xml.replace(xml.children)
      return
    end

    # sanitize CSS in <style> elements
    if xml.name == 'style' && !check_style(whitelist, xml.content)
      xml.remove
      return
    end

    # build the attribute rules once per element; a whitelist without
    # '_common' simply contributes no shared rules
    allowed = (whitelist['_common'] || {}).merge(whitelist[xml.name] || {})

    xml.attribute_nodes.each do |a|
      keep = allowed[a.name] === a.to_s
      # sanitize CSS in style="" attributes
      keep = check_style(whitelist, a.value) if keep && a.name == 'style'
      xml.remove_attribute(a.name) unless keep
    end

    # recurse
    xml.element_children.each {|e| sanitize_element(e, whitelist, &p) }

    yield xml if block_given?
  end

  # Return sanitized HTML.
  #
  # _html_ is parsed with Nokogiri, its <body> is passed through
  # sanitize_element, and the body's children are serialized as XHTML. An
  # empty string is returned when the parsed document has no <body>.
  #
  # If block is supplied, it will be invoked for each Nokogiri::XML::Element
  # in the sanitized HTML.
  #
  def sanitize(html, whitelist = @whitelist, &p)
    document = Nokogiri::HTML(html) {|config| config.noblanks }
    body = document.xpath('//html/body').first
    return '' if body.nil?

    sanitize_element(body, whitelist, &p)
    body.children.to_xhtml
  end

  # NOTE: these constants are publicly reachable as Whitewash::PATH and
  # Whitewash::WHITELIST. A bare `private` in front of them would have no
  # effect, because it only applies to methods defined after it.

  # Directories searched, in order, for the default whitelist file.
  #
  PATH = [ '/etc/ruby-whitewash',
           File.join(RbConfig::CONFIG['datadir'], 'ruby-whitewash'),
           '/usr/local/share/ruby-whitewash/',
           File.expand_path('../data/whitewash/', File.dirname(__FILE__)) ]

  # File name of the default whitelist inside a PATH directory.
  #
  WHITELIST = 'whitelist.yaml'
end