1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
|
# frozen_string_literal: true
module Loofah
#
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
#
# Traverse the document or fragment, invoking the +scrubber+ on each node.
#
# +scrubber+ must either be one of the symbols representing the built-in scrubbers (see
# Scrubbers), or a Scrubber instance.
#
#   span2div = Loofah::Scrubber.new do |node|
#     node.name = "div" if node.name == "span"
#   end
#   Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
#   # => "<div>foo</div><p>bar</p>"
#
# or
#
#   unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
#   Loofah.html5_fragment(unsafe_html).scrub!(:strip).to_s
#   # => "ohai! <div>div is safe</div> "
#
# Note that this method is called implicitly from the shortcuts Loofah.scrub_html5_fragment et
# al.
#
# Please see Scrubber for more information on implementation and traversal, and README.rdoc for
# more example usage.
#
module ScrubBehavior
  module Node # :nodoc:
    #
    #  Scrubs the receiver in place with +scrubber+ and returns the receiver.
    #
    #  +scrubber+ is resolved through ScrubBehavior.resolve_scrubber, so it may be a key of
    #  Scrubbers::MAP or a Loofah::Scrubber instance; anything else raises
    #  Loofah::ScrubberNotFound.
    #
    #  The document, fragment and plain-node cases really are three different methods. They live
    #  in one method because nokogiri decides whether to decorate based on whether the module
    #  name has already been included, and documents get decorated just like their constituent
    #  nodes, so all of the logic has to sit in a single module.
    #
    def scrub!(scrubber)
      resolved = ScrubBehavior.resolve_scrubber(scrubber)

      if is_a?(Nokogiri::XML::Document)
        # a document without a root element has nothing to traverse
        resolved.traverse(root) if root
      elsif is_a?(Nokogiri::XML::DocumentFragment)
        # fragments delegate to their child node set, which scrubs each child in turn
        children.scrub!(resolved)
      else
        resolved.traverse(self)
      end

      self
    end
  end

  module NodeSet # :nodoc:
    #
    #  Calls +scrub!+ with +scrubber+ on every member of the set, then returns the set itself.
    #
    def scrub!(scrubber)
      each do |member|
        member.scrub!(scrubber)
      end

      self
    end
  end

  #
  #  Turns +scrubber+ into a Loofah::Scrubber instance.
  #
  #  When +scrubber+ is a key of Scrubbers::MAP, a new instance of the mapped class is built.
  #  The value (mapped or as given) is returned if it is a Loofah::Scrubber; otherwise
  #  Loofah::ScrubberNotFound is raised.
  #
  def self.resolve_scrubber(scrubber) # :nodoc:
    mapped_class = Scrubbers::MAP[scrubber]
    scrubber = mapped_class.new if mapped_class

    return scrubber if scrubber.is_a?(Loofah::Scrubber)

    raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
  end
end
#
# Overrides +text+ in Document and DocumentFragment classes, and mixes in +to_text+.
#
module TextBehavior
  #
  #  Returns a plain-text version of the markup contained by the document, with HTML entities
  #  encoded.
  #
  #  This method is significantly faster than #to_text, but isn't clever about whitespace around
  #  block elements.
  #
  #    Loofah.html5_document("<h1>Title</h1><div>Content</div>").text
  #    # => "TitleContent"
  #
  #  By default, the returned text will have HTML entities escaped. If you want unescaped
  #  entities, and you understand that the result is unsafe to render in a browser, then you can
  #  pass an argument as shown:
  #
  #    frag = Loofah.html5_fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
  #    # ok for browser:
  #    frag.text                                 # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
  #    # decidedly not ok for browser:
  #    frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
  #
  #  +options+ is a Hash; the only key read here is +:encode_special_chars+, and only an explicit
  #  +false+ disables escaping. An empty string is used as the text when +serialize_root+ returns
  #  nil. Comment children of the root are skipped.
  #
  def text(options = {})
    # Look the root up once: the serialize_root implementations visible in this file each run
    # an XPath query, so there is no reason to evaluate it for both the check and the use.
    root = serialize_root

    result = if root
      root.children.reject(&:comment?).map(&:inner_text).join("")
    else
      ""
    end

    if options[:encode_special_chars] == false
      result # possibly dangerous if rendered in a browser
    else
      encode_special_chars(result)
    end
  end

  alias_method :inner_text, :text
  alias_method :to_str, :text

  #
  #  Returns a plain-text version of the markup contained by the fragment, with HTML entities
  #  encoded.
  #
  #  This method is slower than #text, but is clever about whitespace around block elements and
  #  line break elements.
  #
  #    Loofah.html5_document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
  #    # => "\nTitle\n\nContent\nNext line\n"
  #
  #  The receiver is not modified: the +:newline_block_elements+ scrubber is applied to a +dup+.
  #  +options+ is passed through to #text unchanged.
  #
  def to_text(options = {})
    Loofah.remove_extraneous_whitespace(dup.scrub!(:newline_block_elements).text(options))
  end
end
module DocumentDecorator # :nodoc:
  #
  #  Runs the inherited initializer with the same arguments and block, then appends the scrub
  #  behaviors to this document's decorator lists for Nokogiri::XML::Node and
  #  Nokogiri::XML::NodeSet.
  #
  def initialize(*args, &block)
    super

    {
      Nokogiri::XML::Node => ScrubBehavior::Node,
      Nokogiri::XML::NodeSet => ScrubBehavior::NodeSet,
    }.each do |decorated_class, behavior|
      decorators(decorated_class) << behavior
    end
  end
end
module HtmlDocumentBehavior # :nodoc:
  module ClassMethods
    #
    #  Parses through the inherited +parse+ (same arguments and block) and returns the resulting
    #  document after its top-level comment nodes have been unlinked.
    #
    def parse(*args, &block)
      parsed = super
      remove_comments_before_html_element(parsed)
    end

    private

    #
    #  Unlinks the comment nodes that are direct children of +doc+, i.e. comments that exist
    #  outside of the HTML element, and returns +doc+. Note that every top-level comment is
    #  unlinked, whether it sits before or after the element.
    #
    #  These comments are allowed by the HTML spec:
    #
    #    https://www.w3.org/TR/html401/struct/global.html#h-7.1
    #
    #  but are not scrubbed by Loofah because these nodes don't meet the contract that scrubbers
    #  expect of a node (e.g., it can be replaced, sibling and children nodes can be created).
    #
    def remove_comments_before_html_element(doc)
      doc.children.each do |top_level_node|
        next unless top_level_node.comment?

        top_level_node.unlink
      end

      doc
    end
  end

  # Adds the class-level +parse+ override to whichever class includes this module.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # The node whose children are serialized: the result of the "/html/body" XPath lookup,
  # which may be nil.
  def serialize_root
    at_xpath("/html/body")
  end
end
module HtmlFragmentBehavior # :nodoc:
  module ClassMethods
    #
    #  Builds a fragment from +tags+ inside a freshly created document of +document_klass+.
    #
    #  When +encoding+ is not given, the document's encoding is the name of +tags+' own encoding
    #  if +tags+ responds to +encoding+, and "UTF-8" otherwise.
    #
    def parse(tags, encoding = nil)
      owner_document = document_klass.new

      encoding ||= if tags.respond_to?(:encoding)
        tags.encoding.name
      else
        "UTF-8"
      end
      owner_document.encoding = encoding

      new(owner_document, tags)
    end

    #
    #  The document class paired with this fragment class, memoized per class after the first
    #  successful lookup. Raises ArgumentError for any class other than the two Loofah fragment
    #  classes checked here.
    #
    def document_klass
      return @document_klass if @document_klass

      @document_klass =
        if Loofah.html5_support? && self == Loofah::HTML5::DocumentFragment
          Loofah::HTML5::Document
        elsif self == Loofah::HTML4::DocumentFragment
          Loofah::HTML4::Document
        else
          raise ArgumentError, "unexpected class: #{self}"
        end
    end
  end

  # Adds the class-level +parse+ and +document_klass+ to whichever class includes this module.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Serializes the children of +serialize_root+ to a string.
  def to_s
    root = serialize_root
    root.children.to_s
  end

  alias_method :serialize, :to_s

  # The node whose children are serialized: the result of the "./body" XPath lookup when there
  # is one, otherwise the fragment itself.
  def serialize_root
    at_xpath("./body") || self
  end
end
end
|