File: test_parser.rb

package info (click to toggle)
ruby-nokogiri 1.18.2+dfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 8,076 kB
  • sloc: ansic: 38,893; xml: 27,665; ruby: 27,285; java: 15,348; cpp: 7,107; yacc: 244; sh: 208; makefile: 154; sed: 14
file content (273 lines) | stat: -rw-r--r-- 9,198 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# -*- coding: utf-8 -*-
# frozen_string_literal: true

require "helper"

module Nokogiri
  module SAX
    class TestCase
      # Specs for the event-driven HTML4 SAX parser: input validation for the
      # parse_* entry points, encoding detection/override behavior, and the
      # element/attribute events delivered to the document handler.
      #
      # NOTE(review): Doc, ASSETS_DIR, HTML_FILE and refute_raises are not
      # defined in this file; presumably they come from the required "helper".
      describe Nokogiri::HTML4::SAX::Parser do
        # A fresh parser per example, wired to a new Doc handler whose recorded
        # events are read back through parser.document.
        let(:parser) { Nokogiri::HTML4::SAX::Parser.new(Doc.new) }

        it "raises an error on empty content" do
          e = assert_raises(RuntimeError) { parser.parse("") }
          assert_equal("input string cannot be empty", e.message)
        end

        it "parse_empty_file" do
          # Make sure empty files don't break stuff
          empty_file_name = File.join(ASSETS_DIR, "bogus.xml")

          refute_raises do
            parser.parse_file(empty_file_name)
          end
        end

        it "parse_file" do
          parser.parse_file(HTML_FILE)

          # The expected count differs between the libxml2 and non-libxml2
          # backends; see the comment in the "parse_document" example for a
          # possible reason for this difference.
          if Nokogiri.uses_libxml?
            assert_equal(1111, parser.document.end_elements.length)
          else
            assert_equal(1120, parser.document.end_elements.length)
          end
        end

        it "parse_file_nil_argument" do
          assert_raises(ArgumentError) do
            parser.parse_file(nil)
          end
        end

        it "parse_file_non_existent" do
          assert_raises(Errno::ENOENT) do
            parser.parse_file("there_is_no_reasonable_way_this_file_exists")
          end
        end

        it "parse_file_with_dir" do
          # The directory containing this spec file stands in for "a path that
          # is a directory rather than a file".
          assert_raises(Errno::EISDIR) do
            parser.parse_file(File.dirname(__FILE__))
          end
        end

        it "parse_memory_nil" do
          assert_raises(TypeError) do
            parser.parse_memory(nil)
          end
        end

        describe "encoding" do
          # ISO-8859-1 bytes (\xF6 is "ö" in Latin-1) with a matching declaration.
          let(:html_encoding_iso8859) { <<~HTML }
            <html><meta charset="ISO-8859-1">
            <body>B\xF6hnhardt</body>
          HTML

          # this input string is really UTF-8 but is marked as ISO-8859-1
          let(:html_encoding_broken) { <<~HTML }
            <html><meta charset="ISO-8859-1">
            <body>Böhnhardt</body>
          HTML

          # this input string is really ISO-8859-1 but is marked as UTF-8
          let(:html_encoding_broken2) { <<~HTML }
            <html><meta charset="UTF-8">
            <body>B\xF6hnhardt</body>
          HTML

          it "is nil by default to indicate encoding should be autodetected" do
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            assert_nil(parser.encoding)
          end

          it "can be set in the initializer" do
            assert_equal("UTF-8", Nokogiri::HTML4::SAX::Parser.new(Doc.new, "UTF-8").encoding)
            assert_equal("ISO-2022-JP", Nokogiri::HTML4::SAX::Parser.new(Doc.new, "ISO-2022-JP").encoding)
          end

          it "raises when given an invalid encoding name" do
            # Bad name supplied through the initializer ...
            assert_raises(ArgumentError) do
              Nokogiri::HTML4::SAX::Parser.new(Doc.new, "not an encoding").parse_io(StringIO.new("<root/>"))
            end
            assert_raises(ArgumentError) do
              Nokogiri::HTML4::SAX::Parser.new(Doc.new, "not an encoding").parse_memory("<root/>")
            end
            # ... and supplied per call as the second argument.
            assert_raises(ArgumentError) { parser.parse_io(StringIO.new("<root/>"), "not an encoding") }
            assert_raises(ArgumentError) { parser.parse_memory("<root/>", "not an encoding") }
          end

          it "autodetects the encoding if not overridden" do
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse(html_encoding_iso8859)

            # correctly converted the input ISO-8859-1 to UTF-8 for the callback
            assert_equal("Böhnhardt", parser.document.data.join.strip)
          end

          it "overrides the ISO-8859-1 document's encoding when set via initializer" do
            # Baseline without an override: the document's (wrong) ISO-8859-1
            # declaration is expected to win, so the two UTF-8 bytes of "ö" are
            # each decoded as a separate Latin-1 character (U+00C3, U+00B6).
            # The expectation is spelled with \u escapes so that it cannot be
            # silently "repaired" into the correct text by an editor or filter.
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_memory(html_encoding_broken)

            assert_equal("B\u00C3\u00B6hnhardt", parser.document.data.join.strip)

            # With the real encoding forced via the initializer the text is intact.
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new, "UTF-8")
            parser.parse_memory(html_encoding_broken)

            assert_equal("Böhnhardt", parser.document.data.join.strip)
          end

          it "overrides the UTF-8 document's encoding when set via initializer" do
            # Baseline without an override is only asserted for libxml2 >= 2.13.0.
            if Nokogiri.uses_libxml?(">= 2.13.0") # nekohtml is a better guesser than libxml2
              parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
              parser.parse_memory(html_encoding_broken2)

              assert(parser.document.errors.any? { |e| e.match(/Invalid byte/) })
            end

            # Supplying the real encoding per call removes the decoding errors.
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_memory(html_encoding_broken2, "ISO-8859-1")

            assert_equal("Böhnhardt", parser.document.data.join.strip)
            refute(parser.document.errors.any? { |e| e.match(/Invalid byte/) })
          end

          it "can be set via parse_io" do
            if Nokogiri.uses_libxml?("< 2.13.0")
              skip("older libxml2 encoding detection is sus")
            end

            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_io(StringIO.new(html_encoding_broken), "UTF-8")

            assert_equal("Böhnhardt", parser.document.data.join.strip)

            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_io(StringIO.new(html_encoding_broken2), "ISO-8859-1")

            assert_equal("Böhnhardt", parser.document.data.join.strip)
          end

          it "can be set via parse_memory" do
            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_memory(html_encoding_broken, "UTF-8")

            assert_equal("Böhnhardt", parser.document.data.join.strip)

            parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
            parser.parse_memory(html_encoding_broken2, "ISO-8859-1")

            assert_equal("Böhnhardt", parser.document.data.join.strip)
          end
        end

        it "parse_document" do
          parser.parse_memory(<<~HTML)
            <p>Paragraph 1</p>
            <p>Paragraph 2</p>
          HTML

          # JRuby version is different because of the internal implementation
          # JRuby version uses NekoHTML which inserts empty "head" elements.
          #
          # Currently following features are set:
          # "http://cyberneko.org/html/properties/names/elems" => "lower"
          # "http://cyberneko.org/html/properties/names/attrs" => "lower"
          if Nokogiri.uses_libxml?
            assert_equal(
              [["html", []], ["body", []], ["p", []], ["p", []]],
              parser.document.start_elements,
            )
          else
            assert_equal(
              [["html", []], ["head", []], ["body", []], ["p", []], ["p", []]],
              parser.document.start_elements,
            )
          end
        end

        it "parser_attributes" do
          html = <<~HTML
            <html>
              <head>
                <title>hello</title>
              </head>
            <body>
              <img src="face.jpg" title="daddy &amp; me">
              <hr noshade size="2">
            </body>
            </html>
          HTML

          # The block receives the parser context before parsing; it is used
          # here to turn on entity replacement so "&amp;" arrives as "&".
          block_called = false
          parser.parse(html) do |ctx|
            block_called = true
            ctx.replace_entities = true
          end

          assert(block_called)

          # A valueless attribute is reported with a nil value.
          noshade_value = ["noshade", nil]

          assert_equal(
            [
              ["html", []],
              ["head", []],
              ["title", []],
              ["body", []],
              ["img", [
                ["src", "face.jpg"],
                ["title", "daddy & me"],
              ],],
              ["hr", [
                noshade_value,
                ["size", "2"],
              ],],
            ],
            parser.document.start_elements,
          )
        end

        # Document with an unclosed <br>; six start tags in total
        # (html, head, body, div, br, div).
        let(:html_with_br_tag) { <<~HTML }
          <html>
            <head></head>
            <body>
              <div>
                hello
                <br>
              </div>

              <div>
                hello again
              </div>
            </body>
          </html>
        HTML

        it "parsing_dom_error_from_string" do
          parser.parse(html_with_br_tag)
          assert_equal(6, parser.document.start_elements.length)
        end

        it "parsing_dom_error_from_io" do
          parser.parse(StringIO.new(html_with_br_tag))
          assert_equal(6, parser.document.start_elements.length)
        end

        it "empty_processing_instruction" do
          # https://github.com/sparklemotion/nokogiri/issues/845
          refute_raises do
            parser.parse_memory("<strong>this will segfault<?strong>")
          end
        end

        it "handles invalid types gracefully" do
          # An Integer is neither a String nor an IO; each entry point must
          # reject it with TypeError.
          assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse(0xcafecafe) }
          assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse_memory(0xcafecafe) }
          assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse_io(0xcafecafe) }
        end
      end
    end
  end
end