File: document.rb

package info (click to toggle)
ruby-nokogiri 1.18.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 8,076 kB
  • sloc: ansic: 38,893; xml: 27,665; ruby: 27,285; java: 15,348; cpp: 7,107; yacc: 244; sh: 208; makefile: 154; sed: 14
file content (235 lines) | stat: -rw-r--r-- 8,108 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# coding: utf-8
# frozen_string_literal: true

require "pathname"

module Nokogiri
  module HTML4
    class Document < Nokogiri::XML::Document
      ###
      # Get the meta tag encoding for this document.  If there is no meta tag,
      # then nil is returned.
      def meta_encoding
        if (meta = at_xpath("//meta[@charset]"))
          meta[:charset]
        elsif (meta = meta_content_type)
          meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
        end
      end

      ###
      # Set the meta tag encoding for this document.
      #
      # If an meta encoding tag is already present, its content is
      # replaced with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, and before any text node or
      # content element (typically <body>) if any.
      #
      # The result when trying to set an encoding that is different
      # from the document encoding is undefined.
      #
      # Beware in CRuby, that libxml2 automatically inserts a meta tag
      # into a head element.
      def meta_encoding=(encoding)
        if (meta = meta_content_type)
          meta["content"] = format("text/html; charset=%s", encoding)
          encoding
        elsif (meta = at_xpath("//meta[@charset]"))
          meta["charset"] = encoding
        else
          meta = XML::Node.new("meta", self)
          if (dtd = internal_subset) && dtd.html5_dtd?
            meta["charset"] = encoding
          else
            meta["http-equiv"] = "Content-Type"
            meta["content"] = format("text/html; charset=%s", encoding)
          end

          if (head = at_xpath("//head"))
            head.prepend_child(meta)
          else
            set_metadata_element(meta)
          end
          encoding
        end
      end

      def meta_content_type
        xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
          node["http-equiv"] =~ /\AContent-Type\z/i
        end
      end
      private :meta_content_type

      ###
      # Get the title string of this document.  Return nil if there is
      # no title tag.
      def title
        (title = at_xpath("//title")) && title.inner_text
      end

      ###
      # Set the title string of this document.
      #
      # If a title element is already present, its content is replaced
      # with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, right after a meta
      # encoding/charset tag if any, and before any text node or
      # content element (typically <body>) if any.
      def title=(text)
        tnode = XML::Text.new(text, self)
        if (title = at_xpath("//title"))
          title.children = tnode
          return text
        end

        title = XML::Node.new("title", self) << tnode
        if (head = at_xpath("//head"))
          head << title
        elsif (meta = at_xpath("//meta[@charset]") || meta_content_type)
          # better put after charset declaration
          meta.add_next_sibling(title)
        else
          set_metadata_element(title)
        end
      end

      def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
        if (head = at_xpath("//head"))
          head << element
        elsif (html = at_xpath("//html"))
          head = html.prepend_child(XML::Node.new("head", self))
          head.prepend_child(element)
        elsif (first = children.find do |node|
                 case node
                 when XML::Element, XML::Text
                   true
                 end
               end)
          # We reach here only if the underlying document model
          # allows <html>/<head> elements to be omitted and does not
          # automatically supply them.
          first.add_previous_sibling(element)
        else
          html = add_child(XML::Node.new("html", self))
          head = html.add_child(XML::Node.new("head", self))
          head.prepend_child(element)
        end
      end
      private :set_metadata_element

      ####
      # Serialize Node using +options+. Save options can also be set using a block.
      #
      # See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
      #
      # These two statements are equivalent:
      #
      #  node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
      #
      # or
      #
      #   node.serialize(:encoding => 'UTF-8') do |config|
      #     config.format.as_xml
      #   end
      #
      def serialize(options = {})
        options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
        super
      end

      ####
      # Create a Nokogiri::XML::DocumentFragment from +tags+
      def fragment(tags = nil)
        DocumentFragment.new(self, tags, root)
      end

      # :call-seq:
      #   xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
      #
      # [Returns] The document type which determines CSS-to-XPath translation.
      #
      # See XPathVisitor for more information.
      def xpath_doctype
        Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
      end

      class << self
        # :call-seq:
        #   parse(input) { |options| ... } => Nokogiri::HTML4::Document
        #   parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
        #
        # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
        #
        # [Required Parameters]
        # - +input+ (String | IO) The content to be parsed.
        #
        # [Optional Keyword Arguments]
        # - +url:+ (String) The base URI for this document.
        #
        # - +encoding:+ (String) The name of the encoding that should be used when processing the
        #   document. When not provided, the encoding will be determined based on the document
        #   content.
        #
        # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
        #   behaviors during parsing. See ParseOptions for more information. The default value is
        #   +ParseOptions::DEFAULT_HTML+.
        #
        # [Yields]
        #   If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
        #   can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
        #
        # [Returns] Nokogiri::HTML4::Document
        def parse(
          input,
          url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
          url: url_, encoding: encoding_, options: options_
        )
          options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
          yield options if block_given?

          url ||= input.respond_to?(:path) ? input.path : nil

          if input.respond_to?(:encoding)
            unless input.encoding == Encoding::ASCII_8BIT
              encoding ||= input.encoding.name
            end
          end

          if input.respond_to?(:read)
            if input.is_a?(Pathname)
              # resolve the Pathname to the file and open it as an IO object, see #2110
              input = input.expand_path.open
              url ||= input.path
            end

            unless encoding
              input = EncodingReader.new(input)
              begin
                return read_io(input, url, encoding, options.to_i)
              rescue EncodingReader::EncodingFound => e
                encoding = e.found_encoding
              end
            end
            return read_io(input, url, encoding, options.to_i)
          end

          # read_memory pukes on empty docs
          if input.nil? || input.empty?
            return encoding ? new.tap { |i| i.encoding = encoding } : new
          end

          encoding ||= EncodingReader.detect_encoding(input)

          read_memory(input, url, encoding, options.to_i)
        end
      end
    end
  end
end