File: document_fragment.rb

package info (click to toggle)
ruby-nokogiri 1.18.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 8,076 kB
  • sloc: ansic: 38,893; xml: 27,665; ruby: 27,285; java: 15,348; cpp: 7,107; yacc: 244; sh: 208; makefile: 154; sed: 14
file content (166 lines) | stat: -rw-r--r-- 6,425 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# frozen_string_literal: true

module Nokogiri
  module HTML4
    class DocumentFragment < Nokogiri::XML::DocumentFragment
      #
      # :call-seq:
      #   parse(input) { |options| ... } → HTML4::DocumentFragment
      #   parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
      #
      # Parse \HTML4 fragment input from a String or IO, and return a new HTML4::DocumentFragment.
      # This method creates a new, empty HTML4::Document to contain the fragment.
      #
      # [Required Parameters]
      # - +input+ (String | IO) The content to be parsed.
      #
      # [Optional Keyword Arguments]
      # - +encoding:+ (String) The name of the encoding that should be used when processing the
      #   document. When not provided, the encoding will be determined based on the document
      #   content.
      #
      # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
      #   behaviors during parsing. See ParseOptions for more information. The default value is
      #   +ParseOptions::DEFAULT_HTML+.
      #
      # [Yields]
      #   If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
      #   can be configured before parsing. See ParseOptions for more information.
      #
      # [Returns] HTML4::DocumentFragment
      #
      # *Example:* Parsing a string
      #
      #   fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
      #
      # *Example:* Parsing an IO
      #
      #   fragment = File.open("fragment.html") do |file|
      #     HTML4::DocumentFragment.parse(file)
      #   end
      #
      # *Example:* Specifying encoding
      #
      #   fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
      #
      # *Example:* Setting parse options dynamically
      #
      #   HTML4::DocumentFragment.parse("<div>Hello World") do |options|
      #     options.huge.pedantic
      #   end
      #
      def self.parse(
        input,
        encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
        encoding: encoding_, options: options_,
        &block
      )
        # TODO: this method should take a context node.
        fragment_document = HTML4::Document.new

        # IO-like input (IO, File, StringIO, ...) is drained into a String first.
        # Their _read_ method takes no +encoding+ argument, so a requested encoding is
        # applied through _set_encoding_ (when the object offers it) before reading.
        #
        # After reading, the content carries either the encoding given to
        # _set_encoding_ or, if that was not called, the IO object's own default.
        #
        # For StringIO specifically, _set_encoding_ affects only the internal string,
        # not how the data is read out.
        if input.respond_to?(:read)
          if encoding && input.respond_to?(:set_encoding)
            input.set_encoding(encoding)
          end
          input = input.read
        end

        # No encoding requested: derive one from the input itself. Binary
        # (ASCII-8BIT) input and objects that do not report an encoding are
        # treated as UTF-8.
        unless encoding
          encoding =
            if input.respond_to?(:encoding)
              detected = input.encoding
              detected == ::Encoding::ASCII_8BIT ? "UTF-8" : detected.name
            else
              "UTF-8"
            end
        end

        fragment_document.encoding = encoding

        new(fragment_document, input, options: options, &block)
      end

      #
      # :call-seq:
      #   new(document) { |options| ... } → HTML4::DocumentFragment
      #   new(document, input) { |options| ... } → HTML4::DocumentFragment
      #   new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
      #
      # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
      #
      # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
      # than call this method directly.
      #
      # [Required Parameters]
      # - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
      #
      # [Optional Parameters]
      # - +input+ (String) The content to be parsed.
      #
      # [Optional Keyword Arguments]
      # - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
      #   below for more information.
      #
      # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
      #   behaviors during parsing. See ParseOptions for more information. The default value is
      #   +ParseOptions::DEFAULT_HTML+.
      #
      # [Yields]
      #   If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
      #   can be configured before parsing. See ParseOptions for more information.
      #
      # [Returns] HTML4::DocumentFragment
      #
      # === Context \Node
      #
      # If a context node is specified using +context:+, then the fragment will be created by
      # calling XML::Node#parse on that node, so the parser will behave as if that Node is the
      # parent of the fragment subtree.
      #
      def initialize(
        document, input = nil,
        context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
        context: context_, options: options_
      ) # rubocop:disable Lint/MissingSuper
        # Nothing to parse: leave the fragment empty.
        return self unless input

        # Integer option masks are wrapped so the block always sees a ParseOptions object.
        if Integer === options
          options = Nokogiri::XML::ParseOptions.new(options)
        end
        @parse_options = options
        yield options if block_given?

        if context
          # Parse relative to the context node, inside a throwaway <div> wrapper, and
          # report only the errors that this parse added to the document.
          errors_before = document.errors.dup
          parsed = context.parse("<div>#{input}</div>", options)
          unless parsed.empty?
            parsed.first.children.each { |node| node.parent = self }
          end
          self.errors = document.errors - errors_before
        else
          # This is a horrible hack, but I don't care:
          # input that opens a line with a <body> tag keeps the body element itself,
          # anything else contributes only the nodes found inside the body.
          path = /^\s*?<body/i.match?(input) ? "/html/body" : "/html/body/node()"

          scratch_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
          scratch_doc.xpath(path).each { |node| node.parent = self }
          self.errors = scratch_doc.errors
        end
        children
      end
    end
  end
end