File: html5.rb

package info (click to toggle)
ruby-nokogumbo 2.0.5-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,372 kB
  • sloc: ansic: 32,909; ruby: 452; makefile: 7
file content (252 lines) | stat: -rw-r--r-- 9,798 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
require 'nokogumbo/html5/document'
require 'nokogumbo/html5/document_fragment'
require 'nokogumbo/html5/node'

module Nokogiri
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
  #
  # +string_or_io+, +url+, +encoding+, the keyword +options+ and the optional
  # block are all forwarded unchanged; the return value is whatever
  # Nokogiri::HTML5::Document.parse returns.
  def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block)
    Nokogiri::HTML5::Document.parse(string_or_io, url, encoding, **options, &block)
  end

  module HTML5
    # Namespace URIs consulted by serialize_node_internal when it decides how
    # to print element and attribute names.
    #
    # Elements in the following three namespaces are written with their bare
    # name (no prefix).  HTML uses the XHTML namespace.
    HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
    MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
    SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
    # Attributes in the following three namespaces are written with the fixed
    # prefixes "xlink:", "xml:" and "xmlns" / "xmlns:" respectively.
    XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
    XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
    XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze

    # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
    #
    # +string+, +url+, +encoding+, the keyword +options+ and the optional block
    # are handed to Document.parse as-is, and its result is returned.
    def self.parse(string, url = nil, encoding = nil, **options, &block)
      Document.parse(string, url, encoding, **options, &block)
    end

    # Parse a fragment from +string+. Convenience method for
    # Nokogiri::HTML5::DocumentFragment.parse.
    #
    # The keyword +options+ are collected into a Hash and passed on as a single
    # positional third argument (they are not re-splatted as keywords).  The
    # result of DocumentFragment.parse is returned.
    def self.fragment(string, encoding = nil, **options)
      DocumentFragment.parse(string, encoding, options)
    end

    # Fetch and parse a HTML document from the web, following redirects,
    # handling https, and determining the character encoding using HTML5
    # rules.  +uri+ may be a +String+ or a +URI+.  +options+ contains
    # http headers and special options.  Everything which is not a
    # special option is considered a header.  Special options include:
    #  * :follow_limit => number of redirects which are followed
    #  * :basic_auth => [username, password]
    #
    # Options whose name matches a writer on the Net::HTTP connection (for
    # example :open_timeout) are applied to the connection instead of being
    # sent as headers.  Passing a bare number instead of a Hash is deprecated
    # shorthand for <tt>:follow_limit => number</tt>.
    #
    # Returns the parsed document; the Net::HTTP response object is available
    # from it through +response+.  For a response that is neither a success
    # nor a followable redirect, the error raised by Net::HTTPResponse#value
    # propagates to the caller.
    def self.get(uri, options={})
      # Deprecated numeric form: normalise to a Hash up front so that the
      # each/select/merge calls below work on it as well.
      options = {:follow_limit => options} if Numeric === options
      headers = options.clone
      limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10

      require 'net/http'
      uri = URI(uri) unless URI === uri

      http = Net::HTTP.new(uri.host, uri.port)

      # TLS / SSL support
      http.use_ssl = true if uri.scheme == 'https'

      # Pass through Net::HTTP override values, which currently include:
      #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
      #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
      #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
      #   :verify_callback, :verify_depth, :verify_mode
      # Each one that is consumed here is removed from +headers+ so it is not
      # also sent as an HTTP header.
      options.each_key do |key|
        http.public_send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
      end

      request = Net::HTTP::Get.new(uri.request_uri)

      # basic authentication: explicit option first, then credentials
      # embedded in the URI
      auth = headers.delete(:basic_auth)
      auth ||= [uri.user, uri.password] if uri.user && uri.password
      request.basic_auth auth.first, auth.last if auth

      # remaining options are treated as headers
      headers.each {|key, value| request[key.to_s] = value.to_s}

      response = http.request(request)

      case response
      when Net::HTTPSuccess
        # parse takes (string, url, encoding, **options).  Handing it the
        # options Hash positionally would bind the Hash to +url+ under Ruby 3,
        # so pass the symbol-keyed entries explicitly as keywords.  String
        # keys are HTTP header names and cannot be keywords.
        parse_options = options.select { |key, _value| Symbol === key }
        # to_s guards against a success response that carries no body.
        body = reencode(response.body.to_s, response['content-type'])
        doc = parse(body, nil, nil, **parse_options)
        doc.instance_variable_set('@response', response)
        doc.class.send(:attr_reader, :response)
        doc
      when Net::HTTPRedirection
        # Out of redirects: let Net::HTTP raise for the redirect response.
        response.value if limit <= 1
        location = URI.join(uri, response['location'])
        get(location, options.merge(:follow_limit => limit-1))
      else
        response.value
      end
    end

    # NOTE(review): a bare `private` only affects instance methods defined
    # after it.  Every method below is defined with `def self.`, so this line
    # does not change their visibility and they stay publicly callable.
    # Actually hiding them would take `private_class_method` — confirm that
    # nothing outside this file calls them before doing so.
    private

    # Return the content of +string+ as a UTF-8 encoded String.
    #
    # +string+ is either an object responding to +read+ (its content is read)
    # or any other object (converted with +to_s+).  When +encoding+ is given,
    # the content is taken to be in that encoding; a String argument is
    # duplicated before being relabelled, so the caller's object is not
    # modified.  Content that is not already UTF-8 is converted by reencode.
    def self.read_and_encode(string, encoding)
      # Read the string with the given encoding.
      if string.respond_to?(:read)
        if encoding.nil?
          string = string.read
        elsif string.respond_to?(:set_encoding)
          # IO-like objects (IO, File, StringIO): their #read takes
          # (length, outbuf) and raises when given an encoding: keyword, so
          # read everything and label the bytes afterwards.
          string = string.read.dup
          string.force_encoding(encoding)
        else
          # Other readers (e.g. Pathname) accept the encoding directly.
          string = string.read(encoding: encoding)
        end
      else
        # Otherwise the string has the given encoding.
        string = string.to_s
        if encoding
          string = string.dup
          string.force_encoding(encoding)
        end
      end

      # convert to UTF-8
      if string.encoding != Encoding::UTF_8
        string = reencode(string)
      end
      string
    end

    # Charset sniffing is a complex and controversial topic that understandably
    # isn't done _by default_ by the Ruby Net::HTTP library.  This being said,
    # it is a very real problem for consumers of HTML as the default for HTML
    # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
    # *only* supports utf-8.
    #
    # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
    # detection.  Following this lead, Nokogiri::HTML5 attempts to do likewise,
    # while attempting to more closely follow the HTML5 standard.
    #
    # http://bugs.ruby-lang.org/issues/2567
    # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
    #
    # Returns +body+ converted to UTF-8.  Only a binary (ASCII-8BIT) +body+ is
    # sniffed; the sources are tried in this order and the first hit wins:
    #  1. a byte order mark (UTF-8, UTF-16BE, UTF-16LE)
    #  2. the charset parameter of +content_type+ (a Content-Type header value)
    #  3. a charset inside a <meta> tag within the first 1024 bytes
    #  4. ISO-8859-1
    # A +body+ that already carries another encoding is converted from that
    # encoding directly.  The argument itself is never modified.
    def self.reencode(body, content_type=nil)
      if body.encoding == Encoding::ASCII_8BIT
        encoding = nil

        # look for a Byte Order Mark (BOM)
        initial_bytes = body[0..2].bytes
        if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
          encoding = Encoding::UTF_8
        elsif initial_bytes[0..1] == [0xFE, 0xFF]
          encoding = Encoding::UTF_16BE
        elsif initial_bytes[0..1] == [0xFF, 0xFE]
          encoding = Encoding::UTF_16LE
        end

        # look for a charset in a Content-Type header
        if content_type
          encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
        end

        # look for a charset in a meta tag in the first 1024 bytes
        unless encoding
          # drop comments first so a commented-out meta tag is not picked up
          data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
          # tag names are case-insensitive in HTML, so <META ...> counts too
          data.scan(/<meta.*?>/im).each do |meta|
            encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
          end
        end

        # if all else fails, default to the official default encoding for HTML
        encoding ||= Encoding::ISO_8859_1

        # change the encoding to match the detected or inferred encoding
        body = body.dup
        begin
          body.force_encoding(encoding)
        rescue ArgumentError
          # the declared charset is not an encoding name Ruby knows
          body.force_encoding(Encoding::ISO_8859_1)
        end
      end

      body.encode(Encoding::UTF_8)
    end

    # Write the HTML serialization of +current_node+, including all of its
    # descendants, to +io+.
    #
    # +io+ only needs to support <<.  +encoding+ is passed to escape_text for
    # text and attribute values.  +options+ is a Hash; the only key read here
    # is :preserve_newline, which (together with prepend_newline?) controls
    # whether a newline is written right after the start tag.
    #
    # Raises a RuntimeError for a node type that is not handled below.
    def self.serialize_node_internal(current_node, io, encoding, options)
      case current_node.type
      when XML::Node::ELEMENT_NODE
        ns = current_node.namespace
        ns_uri = ns.nil? ? nil : ns.href
        # XXX(sfc): attach namespaces to all nodes, even html?
        # HTML, MathML and SVG elements (and elements without a namespace) are
        # written with their bare name; anything else keeps its prefix.
        if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
          tagname = current_node.name
        else
          tagname = "#{ns.prefix}:#{current_node.name}"
        end
        io << '<' << tagname
        current_node.attribute_nodes.each do |attr|
          attr_ns = attr.namespace
          if attr_ns.nil?
            attr_name = attr.name
          else
            attr_ns_uri = attr_ns.href
            # attribute name with anything up to the first colon removed
            local_name = attr.name.sub(/^[^:]*:/, '')
            attr_name =
              if attr_ns_uri == XML_NAMESPACE
                'xml:' + local_name
              elsif attr_ns_uri == XMLNS_NAMESPACE && local_name == 'xmlns'
                'xmlns'
              elsif attr_ns_uri == XMLNS_NAMESPACE
                'xmlns:' + local_name
              elsif attr_ns_uri == XLINK_NAMESPACE
                'xlink:' + local_name
              else
                "#{attr_ns.prefix}:#{attr.name}"
              end
          end
          io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
        end
        io << '>'
        # The elements listed here get neither children nor an end tag.
        if !%w[area base basefont bgsound br col embed frame hr img input keygen
               link meta param source track wbr].include?(current_node.name)
          io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
          current_node.children.each do |child|
            # XXX(sfc): Templates handled specially?
            serialize_node_internal(child, io, encoding, options)
          end
          io << '</' << tagname << '>'
        end
      when XML::Node::TEXT_NODE
        parent = current_node.parent
        # Text directly inside these elements is written without escaping.
        # +parent+ may be nil (a text node that is not attached to a tree);
        # such text is escaped like ordinary text instead of failing on nil.
        if parent && parent.element? &&
           %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
          io << current_node.content
        else
          io << escape_text(current_node.content, encoding, false)
        end
      when XML::Node::CDATA_SECTION_NODE
        io << '<![CDATA[' << current_node.content << ']]>'
      when XML::Node::COMMENT_NODE
        io << '<!--' << current_node.content << '-->'
      when XML::Node::PI_NODE
        # NOTE(review): only the node's content is written; its name (the
        # processing-instruction target) is not — confirm this is intended.
        io << '<?' << current_node.content << '>'
      when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
        io << '<!DOCTYPE ' << current_node.name << '>'
      when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
        # Containers produce no markup of their own, only their children.
        current_node.children.each do |child|
          serialize_node_internal(child, io, encoding, options)
        end
      else
        raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
      end
    end

    # Escape +text+ for inclusion in serialized HTML and transcode it to
    # +encoding+.
    #
    # With +attribute_mode+ truthy, the ampersand, the no-break space and the
    # double quote are replaced; otherwise the ampersand, the no-break space
    # and both angle brackets are replaced.  Characters that +encoding+ cannot
    # represent are emitted as hexadecimal character references.
    def self.escape_text(text, encoding, attribute_mode)
      pattern, replacements =
        if attribute_mode
          [/[&\u00a0"]/,
           { '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;' }]
        else
          [/[&\u00a0<>]/,
           { '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;' }]
        end
      escaped = text.gsub(pattern, replacements)

      # Not part of the standard
      as_char_reference = ->(char) { "&\#x#{char.ord.to_s(16)};" }
      escaped.encode(encoding, fallback: as_char_reference)
    end

    # Tell whether +node+ is a pre, textarea or listing element whose first
    # child is a text node starting with a newline.
    def self.prepend_newline?(node)
      return false unless %w[pre textarea listing].include?(node.name)

      kids = node.children
      return false if kids.empty?

      leading = kids[0]
      leading.text? && leading.content.start_with?("\n")
    end
  end
end