File: test_document_encoding.rb

package info (click to toggle)
ruby-nokogiri 1.6.3.1+ds-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 4,132 kB
  • ctags: 2,605
  • sloc: xml: 26,397; ruby: 14,634; ansic: 4,855; yacc: 244; sh: 122; makefile: 11
file content (148 lines) | stat: -rw-r--r-- 5,096 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
require "helper"

module Nokogiri
  module HTML
    # These tests call String#encoding / String#encode, so they are only
    # defined when the running Ruby provides those methods. Feature detection
    # is used rather than matching RUBY_VERSION against 1.9, which would
    # silently skip the whole class on every later Ruby.
    if String.method_defined?(:encoding)
      # Exercises parsing and serialization of HTML documents in various
      # character encodings.
      class TestDocumentEncoding < Nokogiri::TestCase
        # Parses the SHIFT_JIS_HTML fixture from a binary stream and checks the
        # default serialization as well as an explicit UTF-8 serialization.
        def test_encoding
          # Block form so the fixture handle is closed once the parser has
          # read the stream.
          doc = File.open(SHIFT_JIS_HTML, 'rb') { |io| Nokogiri::HTML(io) }

          hello = "こんにちは"
          native_html = doc.to_html
          utf8_html = doc.to_html(:encoding => 'UTF-8')

          assert_match doc.encoding, native_html
          assert_match hello.encode('Shift_JIS'), native_html
          assert_equal 'Shift_JIS', native_html.encoding.name

          assert_match hello, utf8_html
          assert_match 'UTF-8', utf8_html
          assert_equal 'UTF-8', utf8_html.encoding.name
        end

        # Parses a string read with an explicit cp932 external/internal
        # encoding and checks that the text survives a UTF-8 serialization.
        def test_encoding_without_charset
          source = File.open(SHIFT_JIS_NO_CHARSET, 'r:cp932:cp932') { |io| io.read }
          doc = Nokogiri::HTML(source)

          hello = "こんにちは"
          utf8_html = doc.to_html(:encoding => 'UTF-8')

          assert_match hello, doc.content
          assert_match hello, utf8_html
          assert_equal 'UTF-8', utf8_html.encoding.name
        end

        # With a malformed charset declaration in the markup, the document
        # encoding is expected to equal the encoding of the source string.
        def test_default_to_encoding_from_string
          bad_charset = bad_charset_html
          expected = bad_charset.encoding.name

          assert_equal expected, Nokogiri::HTML(bad_charset).encoding
          assert_equal expected, Nokogiri.parse(bad_charset).encoding
        end

        # Round-trips Japanese text through documents encoded as Shift_JIS and
        # EUC-JP and compares the resulting bytes.
        def test_encoding_non_utf8
          orig = '日本語が上手です'
          bin = Encoding::ASCII_8BIT
          [Encoding::Shift_JIS, Encoding::EUC_JP].each do |enc|
            html = <<-eohtml.encode(enc)
<html>
<meta http-equiv="Content-Type" content="text/html; charset=#{enc.name}">
<title xml:lang="ja">#{orig}</title></html>
            eohtml
            text = Nokogiri::HTML.parse(html).at('title').inner_text
            # Compare raw bytes so the assertion does not depend on how the
            # two strings are tagged.
            assert_equal(
              orig.encode(enc).force_encoding(bin),
              text.encode(enc).force_encoding(bin)
            )
          end
        end

        # Passing an unknown encoding name must still yield a document whose
        # links can be queried.
        def test_encoding_with_a_bad_name
          doc = Nokogiri::HTML(bad_charset_html, nil, 'askldjfhalsdfjhlkasdfjh')
          hrefs = doc.css('a').map { |a| a['href'] }

          assert_equal ['http://tenderlovemaking.com/'], hrefs
        end

        private

        # Markup whose meta tag carries a malformed charset declaration
        # ("charset=charset=UTF-8"). Returns a fresh String on every call.
        def bad_charset_html
          <<-eohtml
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
</head>
<body>
  <a href="http://tenderlovemaking.com/">blah!</a>
</body>
</html>
          eohtml
        end
      end
    end

    # Checks encoding detection for HTML documents parsed from binary strings
    # and from binary-mode streams.
    class TestDocumentEncodingDetection < Nokogiri::TestCase
      if IO.respond_to?(:binread)
        # Returns the contents of +file+ read in binary mode.
        def binread(file)
          IO.binread(file)
        end
      else
        # Fallback for Rubies without IO.binread: read through an 'rb' handle
        # so the bytes are not subject to text-mode translation.
        def binread(file)
          File.open(file, 'rb') { |io| io.read }
        end
      end

      # Opens +file+ in binary mode.
      #
      # With a block, yields the handle, closes it when the block finishes and
      # returns the block's value. Without a block, returns the open File and
      # the caller is responsible for closing it.
      def binopen(file, &block)
        File.open(file, 'rb', &block)
      end

      # A document parsed from a stream and one parsed from a string of the
      # same fixture must serialize to the same length.
      def test_document_html_noencoding
        from_stream = binopen(NOENCODING_FILE) { |io| Nokogiri::HTML(io) }
        from_string = Nokogiri::HTML(binread(NOENCODING_FILE))

        assert_equal from_string.to_s.size, from_stream.to_s.size
      end

      # The METACHARSET_FILE fixture is expected to be detected as iso-2022-jp.
      def test_document_html_charset
        html = binopen(METACHARSET_FILE) { |io| Nokogiri::HTML(io) }

        assert_equal 'iso-2022-jp', html.encoding
        assert_equal 'たこ焼き仮面', html.title
      end

      # Parses each fixture four ways (string/stream, with and without an
      # explicit Shift_JIS encoding) and expects identical title and paragraphs.
      def test_document_xhtml_enc
        # Expectations do not depend on the fixture, so build them once.
        title = 'たこ焼き仮面'
        evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' }

        [ENCODING_XHTML_FILE, ENCODING_HTML_FILE].each do |file|
          doc_from_string_enc = Nokogiri::HTML(binread(file), nil, 'Shift_JIS')
          doc_from_string = Nokogiri::HTML(binread(file))
          doc_from_file_enc = binopen(file) { |io| Nokogiri::HTML(io, nil, 'Shift_JIS') }
          doc_from_file = binopen(file) { |io| Nokogiri::HTML(io) }

          # NOTE(review): the title check for the stream parse without an
          # explicit encoding is excluded on JRuby for the HTML fixture; the
          # reason is not visible in this file.
          skip_file_title = Nokogiri.jruby? && file == ENCODING_HTML_FILE

          assert_equal(title, doc_from_string_enc.at('//title/text()').text)
          assert_equal(title, doc_from_string.at('//title/text()').text)
          assert_equal(title, doc_from_file_enc.at('//title/text()').text)
          assert_equal(title, doc_from_file.at('//title/text()').text) unless skip_file_title

          assert_equal(evil, paragraph_texts(doc_from_string_enc))
          assert_equal(evil, paragraph_texts(doc_from_string))
          assert_equal(evil, paragraph_texts(doc_from_file_enc))
          assert_equal(evil, paragraph_texts(doc_from_file))
        end
      end

      private

      # Returns the text of every text node directly under a <p> in +doc+.
      def paragraph_texts(doc)
        doc.xpath('//p/text()').map { |node| node.text }
      end
    end
  end
end