File: test_document_encoding.rb

package info (click to toggle)
ruby-nokogiri 1.10.0+dfsg1-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 5,088 kB
  • sloc: xml: 28,081; ruby: 16,687; java: 13,293; ansic: 4,954; yacc: 265; sh: 76; makefile: 19
file content (143 lines) | stat: -rw-r--r-- 4,865 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
require "helper"

module Nokogiri
  module HTML
    # Exercises the encoding an HTML document reports and the encoding of the
    # markup it serializes, for several kinds of input: fixture files, strings
    # with a bogus charset declaration, non-UTF-8 strings and empty input.
    class TestDocumentEncoding < Nokogiri::TestCase
      # Parses the SHIFT_JIS_HTML fixture from a binary stream and checks the
      # default serialization as well as an explicit UTF-8 serialization.
      def test_encoding
        # Block form so the fixture handle is closed as soon as parsing
        # returns instead of staying open until garbage collection.
        doc = File.open(SHIFT_JIS_HTML, 'rb') { |io| Nokogiri::HTML(io) }

        hello = "こんにちは"

        native_html = doc.to_html
        assert_match doc.encoding, native_html
        assert_match hello.encode('Shift_JIS'), native_html
        assert_equal 'Shift_JIS', native_html.encoding.name

        utf8_html = doc.to_html(:encoding => 'UTF-8')
        assert_match hello, utf8_html
        assert_match 'UTF-8', utf8_html
        # Exact comparison, consistent with the Shift_JIS check above; a
        # pattern match would also accept any name merely containing "UTF-8".
        assert_equal 'UTF-8', utf8_html.encoding.name
      end

      # Parses a String read from the SHIFT_JIS_NO_CHARSET fixture and checks
      # that the text survives into the content and a UTF-8 serialization.
      def test_encoding_without_charset
        # Block form closes the handle right after the read. The mode string
        # is unchanged, so the parser receives the same cp932-tagged String.
        markup = File.open(SHIFT_JIS_NO_CHARSET, 'r:cp932:cp932', &:read)
        doc = Nokogiri::HTML(markup)

        hello = "こんにちは"

        utf8_html = doc.to_html(:encoding => 'UTF-8')
        assert_match hello, doc.content
        assert_match hello, utf8_html
        assert_equal 'UTF-8', utf8_html.encoding.name
      end

      # With a malformed charset in the meta tag, the document encoding is
      # expected to equal the encoding of the source String, for both
      # Nokogiri::HTML and Nokogiri.parse.
      def test_default_to_encoding_from_string
        bad_charset = bad_charset_html

        doc = Nokogiri::HTML(bad_charset)
        assert_equal bad_charset.encoding.name, doc.encoding

        doc = Nokogiri.parse(bad_charset)
        assert_equal bad_charset.encoding.name, doc.encoding
      end

      # Round-trips Japanese text through Shift_JIS and EUC-JP encoded markup.
      def test_encoding_non_utf8
        orig = '日本語が上手です'
        bin = Encoding::ASCII_8BIT
        [Encoding::Shift_JIS, Encoding::EUC_JP].each do |enc|
          html = <<-eohtml.encode(enc)
<html>
<meta http-equiv="Content-Type" content="text/html; charset=#{enc.name}">
<title xml:lang="ja">#{orig}</title></html>
          eohtml
          text = Nokogiri::HTML.parse(html).at('title').inner_text
          # Both sides are re-tagged as ASCII-8BIT so the comparison is made
          # on the encoded bytes rather than on encoding-aware strings.
          assert_equal(
            orig.encode(enc).force_encoding(bin),
            text.encode(enc).force_encoding(bin)
          )
        end
      end

      # Passing a nonsense encoding name must still yield a document whose
      # link can be queried.
      def test_encoding_with_a_bad_name
        doc = Nokogiri::HTML(bad_charset_html, nil, 'askldjfhalsdfjhlkasdfjh')
        assert_equal ['http://tenderlovemaking.com/'],
          doc.css('a').map { |a| a['href'] }
      end

      # Parsing nil with an explicit encoding yields a document reporting
      # that encoding.
      def test_empty_doc_encoding
        encoding = 'US-ASCII'
        assert_equal encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding
      end

      private

      # Markup whose meta tag carries a malformed charset declaration
      # ("charset=charset=UTF-8"). Shared by the tests above that previously
      # each embedded an identical copy.
      def bad_charset_html
        <<-eohtml
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
</head>
<body>
  <a href="http://tenderlovemaking.com/">blah!</a>
</body>
</html>
        eohtml
      end
    end

    # Checks that parsing the same fixture from a binary String and from a
    # binary stream, with and without an explicit encoding, gives the same
    # results.
    class TestDocumentEncodingDetection < Nokogiri::TestCase
      # Returns the whole content of +file+ as a binary String.
      def binread(file)
        IO.binread(file)
      end

      # Opens +file+ in binary read mode.
      #
      # Without a block the open File is returned and the caller must close
      # it. With a block the File is yielded, closed when the block exits, and
      # the block's value is returned (standard File.open semantics).
      def binopen(file, &block)
        File.open(file, 'rb', &block)
      end

      # String and stream parsing of the NOENCODING_FILE fixture must
      # serialize to the same size.
      def test_document_html_noencoding
        from_stream = parse_stream(NOENCODING_FILE)
        from_string = Nokogiri::HTML(binread(NOENCODING_FILE))

        assert_equal from_string.to_s.size, from_stream.to_s.size
      end

      # The METACHARSET_FILE fixture is expected to be reported as iso-2022-jp
      # and to yield a readable title.
      def test_document_html_charset
        html = parse_stream(METACHARSET_FILE)
        assert_equal 'iso-2022-jp', html.encoding
        assert_equal 'たこ焼き仮面', html.title
      end

      # For both fixtures, all four combinations of input (String or stream)
      # and encoding (explicit 'Shift_JIS' or none) must produce the same
      # title and paragraph texts.
      def test_document_xhtml_enc
        [ENCODING_XHTML_FILE, ENCODING_HTML_FILE].each do |file|
          doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS')
          ary_from_string_enc = doc_from_string_enc.xpath('//p/text()').map(&:text)

          doc_from_string = Nokogiri::HTML(binread(file))
          ary_from_string = doc_from_string.xpath('//p/text()').map(&:text)

          doc_from_file_enc = parse_stream(file, nil, 'Shift_JIS')
          ary_from_file_enc = doc_from_file_enc.xpath('//p/text()').map(&:text)

          doc_from_file = parse_stream(file)
          ary_from_file = doc_from_file.xpath('//p/text()').map(&:text)

          title = 'たこ焼き仮面'

          assert_equal(title, doc_from_string_enc.at('//title/text()').text)
          assert_equal(title, doc_from_string.at('//title/text()').text)
          assert_equal(title, doc_from_file_enc.at('//title/text()').text)
          assert_equal(title, doc_from_file.at('//title/text()').text)

          # Expected paragraphs: the same sentence prefixed by 0..72 copies of
          # one character.
          evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' }

          assert_equal(evil, ary_from_string_enc)
          assert_equal(evil, ary_from_string)
          assert_equal(evil, ary_from_file_enc)
          assert_equal(evil, ary_from_file)
        end
      end

      private

      # Parses +file+ from a binary stream, forwarding any extra arguments
      # (url, encoding, ...) to Nokogiri::HTML. The stream is closed when
      # parsing returns, so no file handle outlives the test.
      def parse_stream(file, *parse_args)
        binopen(file) { |io| Nokogiri::HTML(io, *parse_args) }
      end
    end
  end
end