File: test_document_encoding.rb

package info (click to toggle)
ruby-nokogiri 1.18.2+dfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 8,076 kB
  • sloc: ansic: 38,893; xml: 27,665; ruby: 27,285; java: 15,348; cpp: 7,107; yacc: 244; sh: 208; makefile: 154; sed: 14
file content (137 lines) | stat: -rw-r--r-- 5,013 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# frozen_string_literal: true

require "helper"

module Nokogiri
  module XML
    # Specs covering how Nokogiri::XML::Document reports its encoding, applies a requested
    # encoding, and serializes to encoded strings and IO-like objects.
    class TestDocumentEncoding < Nokogiri::TestCase
      describe "Nokogiri::XML::Document encoding" do
        # Documents parsed from the fixture files named by SHIFT_JIS_XML and XML_FILE.
        let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
        let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }

        # A UTF-16 encoded XML *string* (not a parsed document) with a long text node.
        let(:utf16_document) do
          # The payload has to be big enough to force libxml2 to flush its output buffer. That
          # buffer's size comes from MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
          padding = "A" * 8000
          markup = <<~XML
            <?xml version="1.0" encoding="UTF-16"?>
            <root>
              <bar>#{padding}</bar>
            </root>
          XML
          markup.encode(Encoding::UTF_16)
        end

        describe "#encoding" do
          it "describes the document's encoding correctly" do
            reported = shift_jis_document.encoding

            assert_equal("Shift_JIS", reported)
          end

          it "applies the specified encoding even if on empty documents" do
            requested = "Shift_JIS"
            empty_doc = Nokogiri::XML(nil, nil, requested)

            assert_equal(requested, empty_doc.encoding)
          end

          it "applies the specified kwargs encoding even if on empty documents" do
            requested = "Shift_JIS"
            empty_doc = Nokogiri::XML(nil, encoding: requested)

            assert_equal(requested, empty_doc.encoding)
          end
        end

        describe "#encoding=" do
          it "determines the document's encoding when serialized" do
            # Reassign the encoding twice on the same document; each serialization must
            # advertise the most recently assigned name in its XML declaration.
            ["UTF-8", "EUC-JP"].each do |name|
              ascii_document.encoding = name
              assert_match("encoding=\"#{name}\"", ascii_document.to_xml)
            end
          end
        end

        it "encodes the URL as UTF-8" do
          url = shift_jis_document.url

          assert_equal(Encoding::UTF_8, url.encoding)
        end

        it "encodes the encoding name as UTF-8" do
          encoding_name = shift_jis_document.encoding

          assert_equal(Encoding::UTF_8, encoding_name.encoding)
        end

        it "encodes the library versions as UTF-8" do
          skip_unless_libxml2

          libxml_version = Nokogiri::LIBXML_COMPILED_VERSION
          assert_equal(Encoding::UTF_8, libxml_version.encoding)

          libxslt_version = Nokogiri::LIBXSLT_COMPILED_VERSION
          assert_equal(Encoding::UTF_8, libxslt_version.encoding)
        end

        it "parses and serializes UTF-16 correctly" do
          markup = <<~XML
            <?xml version="1.0" encoding="UTF-16"?>
            <root><bar>A</bar></root>
          XML
          serialized = Nokogiri::XML(markup.encode(Encoding::UTF_16)).to_xml
          reparsed = Nokogiri::XML(serialized)

          # These byte counts record what the implementations emit today; they are not a
          # requirement. The two values differ only by whitespace and may drift as the
          # implementations change. The goal is to confirm the output is _roughly_ the right
          # length: not zero, not half-width, not double-width.
          expected_bytesize = if Nokogiri.jruby?
            132
          else
            142
          end

          assert_equal(Encoding::UTF_16, serialized.encoding)
          assert_equal("UTF-16", reparsed.encoding)
          assert_equal(expected_bytesize, serialized.bytesize)

          bar_text = reparsed.at_xpath("/root/bar/text()")
          assert(bar_text, "unexpected DOM structure in #{serialized.inspect}")
          assert_equal("A", bar_text.content)
        end

        it "serializes UTF-16 correctly across libxml2 buffer flushes" do
          # https://github.com/sparklemotion/nokogiri/issues/752
          skip_unless_libxml2

          round_tripped = Nokogiri::XML(utf16_document).to_xml

          assert_equal(Encoding::UTF_16, round_tripped.encoding)
          assert_equal(utf16_document.bytesize, round_tripped.bytesize)
        end

        describe "pseudo-IO" do
          it "serializes correctly with Zip::OutputStream objects" do
            # https://github.com/sparklemotion/nokogiri/issues/2773
            # rubyzip is optional: skip the test rather than fail when it cannot be loaded.
            begin
              require "zip"
            rescue LoadError
              skip("rubyzip is not installed")
            end

            entry_name = "test-utf8.xml"
            markup = <<~XML
              <?xml version="1.0" encoding="UTF-8"?>
              <root>
                <bar>A</bar>
              </root>
            XML

            Dir.mktmpdir do |scratch_dir|
              archive_path = File.join(scratch_dir, "test.zip")

              # Write the document straight into a zip entry ...
              Zip::OutputStream.open(archive_path) do |zip_out|
                zip_out.put_next_entry(entry_name)
                Nokogiri::XML(markup).write_to(zip_out, encoding: "UTF-8")
              end

              # ... then read the entry back and check what was stored.
              Zip::InputStream.open(archive_path) do |zip_in|
                first_entry = zip_in.get_next_entry
                assert_equal(entry_name, first_entry.name)
                contents = zip_in.read

                # JRuby omits the final newline. Descriptive, not prescriptive.
                expected_length = markup.bytesize
                expected_length -= 1 if Nokogiri.jruby?

                assert_equal(Encoding::UTF_8, contents.encoding)
                assert_equal(expected_length, contents.bytesize)
              end
            end
          end
        end
      end
    end
  end
end