1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
|
require 'nokogumbo/html5/document'
require 'nokogumbo/html5/document_fragment'
require 'nokogumbo/html5/node'
module Nokogiri
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
#
# All arguments (+string_or_io+, +url+, +encoding+ and the keyword +options+)
# and the optional block are forwarded unchanged to
# Nokogiri::HTML5::Document.parse, and the result of that call is returned.
def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block)
Nokogiri::HTML5::Document.parse(string_or_io, url, encoding, **options, &block)
end
module HTML5
# Namespace URIs consulted by serialize_node_internal when it decides how
# element and attribute names are written out.
#
# HTML uses the XHTML namespace.
HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
# MathML namespace. Like HTML and SVG elements, elements in this namespace
# are serialized without a prefix.
MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
# SVG namespace (elements are serialized without a prefix).
SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
# XLink namespace; attributes in it are serialized with an "xlink:" prefix.
XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
# XML namespace; attributes in it are serialized with an "xml:" prefix.
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
# XMLNS namespace; attributes in it are serialized as "xmlns" or "xmlns:<name>".
XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
#
# +string+, +url+, +encoding+, the keyword +options+ and the optional block
# are forwarded unchanged to Document.parse; its result is returned.
# Note that the second positional parameter is +url+, so option hashes have
# to be passed as keywords to end up in +options+.
def self.parse(string, url = nil, encoding = nil, **options, &block)
Document.parse(string, url, encoding, **options, &block)
end
# Parse a fragment from +string+. Convenience method for
# Nokogiri::HTML5::DocumentFragment.parse.
#
# +encoding+ is forwarded as given (nil when omitted). The keyword +options+
# are collected into a Hash and handed to DocumentFragment.parse as its third
# positional argument (they are not re-splatted as keywords). Returns
# whatever DocumentFragment.parse returns.
def self.fragment(string, encoding = nil, **options)
DocumentFragment.parse(string, encoding, options)
end
# Fetch and parse a HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
# http headers and special options. Everything which is not a
# special option is considered a header. Special options include:
# * :follow_limit => number of redirects which are followed (default 10)
# * :basic_auth => [username, password]
#
# Any option whose name matches a Net::HTTP setter (for example
# :read_timeout or :verify_mode) is applied to the connection instead of
# being sent as a header.
#
# Passing a bare number as +options+ is deprecated; it is interpreted as the
# :follow_limit.
#
# Returns the parsed document, which additionally answers +response+ with
# the Net::HTTP response it was built from. Non-success responses, and
# redirects once the follow limit is used up, raise through
# Net::HTTPResponse#value.
def self.get(uri, options={})
  # deprecated: a bare number is the redirect limit. Normalise +options+
  # itself (not only the header copy) because it is iterated and merged below.
  options = {:follow_limit => options} if Numeric === options

  headers = options.clone
  # Always drop :follow_limit from the headers, even when its value is falsy,
  # so it can never leak into the request as an HTTP header.
  limit = (headers.delete(:follow_limit) || 10).to_i

  require 'net/http'
  uri = URI(uri) unless URI === uri

  http = Net::HTTP.new(uri.host, uri.port)

  # TLS / SSL support
  http.use_ssl = true if uri.scheme == 'https'

  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  options.each_key do |key|
    setter = "#{key}="
    http.public_send(setter, headers.delete(key)) if http.respond_to?(setter)
  end

  request = Net::HTTP::Get.new(uri.request_uri)

  # basic authentication: explicit option first, then credentials in the URI
  auth = headers.delete(:basic_auth)
  auth ||= [uri.user, uri.password] if uri.user && uri.password
  request.basic_auth auth.first, auth.last if auth

  # remaining options are treated as headers
  headers.each { |key, value| request[key.to_s] = value.to_s }

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    # Forward the options as keywords. The second positional parameter of
    # parse is +url+, so a positional hash would be bound to it on Ruby 3.
    # Only symbol keys can be keyword names on every supported Ruby.
    parse_options = options.select { |key, _value| key.is_a?(Symbol) }
    doc = parse(reencode(response.body, response['content-type']), **parse_options)
    doc.instance_variable_set('@response', response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    # out of redirects: let Net::HTTP raise for the 3xx response
    response.value if limit <= 1
    location = URI.join(uri, response['location'])
    get(location, options.merge(:follow_limit => limit-1))
  else
    response.value
  end
end
# NOTE(review): a bare `private` only changes the visibility of instance
# methods defined after it. Every method below is defined with `def self.`,
# so this line does not make them private; they remain callable as
# Nokogiri::HTML5.<name>. Presumably other parts of the library depend on
# that -- confirm before switching to private_class_method.
private
# Obtain the document text from +string+ and return it as UTF-8.
#
# * If +string+ responds to +read+ it is read in full; when +encoding+ is not
#   nil it is passed along as the +encoding:+ keyword of that +read+ call.
# * Otherwise +string+ is converted with +to_s+; when +encoding+ is given a
#   copy is tagged with that encoding (the caller's object is not modified).
#
# Text that is not already tagged as UTF-8 is converted by reencode.
def self.read_and_encode(string, encoding)
  text =
    if string.respond_to?(:read)
      # Readable source: pull the whole content, honouring the requested encoding.
      encoding.nil? ? string.read : string.read(encoding: encoding)
    elsif encoding
      # Plain value with a declared encoding: relabel a copy, leave the bytes alone.
      string.to_s.dup.force_encoding(encoding)
    else
      string.to_s
    end

  # Nothing to convert when the text is already UTF-8.
  return text if text.encoding == Encoding::UTF_8

  reencode(text)
end
# Charset sniffing is a complex and controversial topic that understandably
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
# it is a very real problem for consumers of HTML as the default for HTML
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
# *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
# while attempting to more closely follow the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# Returns a UTF-8 copy of +body+; +body+ itself is never modified.
#
# Detection only happens when +body+ is tagged as binary (ASCII-8BIT). The
# first source that yields an encoding wins:
# 1. a Byte Order Mark at the start of +body+,
# 2. the charset parameter of +content_type+ (a Content-Type header value),
# 3. a charset found in a <meta> tag (matched case-insensitively) within the
#    first 1024 bytes, ignoring HTML comments,
# 4. ISO-8859-1 as the fallback, also used when the detected name is not an
#    encoding Ruby knows.
#
# A +body+ that already carries a non-binary encoding is simply transcoded.
# String#encode may still raise if the bytes are not valid in the chosen
# encoding.
def self.reencode(body, content_type=nil)
  if body.encoding == Encoding::ASCII_8BIT
    encoding = nil

    # look for a Byte Order Mark (BOM)
    initial_bytes = body[0..2].bytes
    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
      encoding = Encoding::UTF_8
    elsif initial_bytes[0..1] == [0xFE, 0xFF]
      encoding = Encoding::UTF_16BE
    elsif initial_bytes[0..1] == [0xFF, 0xFE]
      encoding = Encoding::UTF_16LE
    end

    # look for a charset in the Content-Type header
    if content_type
      encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
    end

    # look for a charset in a meta tag in the first 1024 bytes
    unless encoding
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      # HTML tag names are case-insensitive, so <META ...> must match too.
      data.scan(/<meta.*?>/mi).each do |meta|
        encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # change the encoding to match the detected or inferred encoding
    body = body.dup
    begin
      body.force_encoding(encoding)
    rescue ArgumentError
      # unknown encoding name: fall back to the HTML default
      body.force_encoding(Encoding::ISO_8859_1)
    end
  end

  body.encode(Encoding::UTF_8)
end
# Append the HTML serialization of +current_node+ (and, recursively, of its
# children) to +io+ using <<.
#
# +encoding+ is handed to escape_text for attribute values and ordinary text.
# +options+ is read for :preserve_newline only: when it is truthy and
# prepend_newline? accepts the element, a "\n" is written right after the
# start tag.
#
# Raises a RuntimeError for node types that are not handled here.
def self.serialize_node_internal(current_node, io, encoding, options)
  case current_node.type
  when XML::Node::ELEMENT_NODE
    element_ns = current_node.namespace
    element_uri = element_ns.href unless element_ns.nil?

    # XXX(sfc): attach namespaces to all nodes, even html?
    # Elements without a namespace, or in the HTML/MathML/SVG namespaces,
    # are written with their bare name; everything else gets its prefix.
    bare_name = element_uri.nil? || element_uri == HTML_NAMESPACE ||
                element_uri == MATHML_NAMESPACE || element_uri == SVG_NAMESPACE
    tagname = bare_name ? current_node.name : "#{element_ns.prefix}:#{current_node.name}"

    io << '<' << tagname
    current_node.attribute_nodes.each do |attribute|
      attribute_ns = attribute.namespace
      qualified_name =
        if attribute_ns.nil?
          attribute.name
        else
          # Well-known namespaces get a fixed prefix in front of the local
          # name (whatever precedes the first colon is dropped).
          case attribute_ns.href
          when XML_NAMESPACE
            'xml:' + attribute.name.sub(/^[^:]*:/, '')
          when XMLNS_NAMESPACE
            local_name = attribute.name.sub(/^[^:]*:/, '')
            local_name == 'xmlns' ? 'xmlns' : 'xmlns:' + local_name
          when XLINK_NAMESPACE
            'xlink:' + attribute.name.sub(/^[^:]*:/, '')
          else
            "#{attribute_ns.prefix}:#{attribute.name}"
          end
        end
      io << ' ' << qualified_name << '="' << escape_text(attribute.content, encoding, true) << '"'
    end
    io << '>'

    # Elements in this list get neither children nor an end tag.
    childless = %w[area base basefont bgsound br col embed frame hr img input keygen
                   link meta param source track wbr]
    unless childless.include?(current_node.name)
      io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
      # XXX(sfc): Templates handled specially?
      current_node.children.each { |child| serialize_node_internal(child, io, encoding, options) }
      io << '</' << tagname << '>'
    end
  when XML::Node::TEXT_NODE
    owner = current_node.parent
    # Text directly inside these elements is emitted verbatim, not escaped.
    verbatim = owner.element? &&
               %w[style script xmp iframe noembed noframes plaintext noscript].include?(owner.name)
    io << (verbatim ? current_node.content : escape_text(current_node.content, encoding, false))
  when XML::Node::CDATA_SECTION_NODE
    io << '<![CDATA[' << current_node.content << ']]>'
  when XML::Node::COMMENT_NODE
    io << '<!--' << current_node.content << '-->'
  when XML::Node::PI_NODE
    io << '<?' << current_node.content << '>'
  when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
    io << '<!DOCTYPE ' << current_node.name << '>'
  when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
    # Containers contribute no markup of their own.
    current_node.children.each { |child| serialize_node_internal(child, io, encoding, options) }
  else
    raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
  end
end
# Escape +text+ for inclusion in serialized HTML and transcode it to
# +encoding+.
#
# When +attribute_mode+ is truthy the text is an attribute value: "&", the
# no-break space (U+00A0) and '"' are replaced by &amp;, &nbsp; and &quot;.
# Otherwise it is element content: "&", U+00A0, "<" and ">" are replaced by
# &amp;, &nbsp;, &lt; and &gt;.
#
# Characters that cannot be represented in +encoding+ are written as
# hexadecimal numeric character references. Returns the new string; +text+
# itself is not modified.
def self.escape_text(text, encoding, attribute_mode)
  if attribute_mode
    text = text.gsub(/[&\u00a0"]/,
                     '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
  else
    text = text.gsub(/[&\u00a0<>]/,
                     '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
  end
  # Not part of the standard
  text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
end
# Decide whether an extra newline belongs right after the start tag of +node+.
#
# Only pre, textarea and listing elements that have at least one child
# qualify, and only when their first child is a text node whose content
# starts with "\n".
def self.prepend_newline?(node)
  newline_sensitive = %w[pre textarea listing].include?(node.name)
  return false if !newline_sensitive || node.children.empty?

  leading_child = node.children[0]
  leading_child.text? && leading_child.content.start_with?("\n")
end
end
end
|