File: encoding_detector_test.rb

package info (click to toggle)
ruby-charlock-holmes 0.6.9.4.dfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 304 kB
  • ctags: 94
  • sloc: ruby: 361; ansic: 264; lisp: 237; cpp: 101; sh: 21; makefile: 2
file content (119 lines) | stat: -rw-r--r-- 4,049 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# encoding: utf-8
require File.expand_path("../helper", __FILE__)

# Tests for CharlockHolmes::EncodingDetector, covering the class-level and
# instance-level `detect` / `detect_all` entry points, the `strip_tags`
# flag, the supported-encodings list, and detection against fixture files.
class EncodingDetectorTest < MiniTest::Unit::TestCase
  # Fixture table: [file name under ../fixtures, expected encoding, expected type].
  # The binary fixture is expected to report no encoding at all.
  MAPPING = [
    ['repl2.cljs',                'ISO-8859-1', :text],
    ['core.rkt',                  'UTF-8',      :text],
    ['cl-messagepack.lisp',       'ISO-8859-1', :text],
    ['TwigExtensionsDate.es.yml', 'UTF-8',      :text],
    ['AnsiGraph.psm1',            'UTF-16LE',   :text],
    ['laholator.py',              'UTF-8',      :text],
    ['hello_world',               nil,          :binary]
  ].freeze

  # A fresh detector instance is built for every test.
  def setup
    @detector = CharlockHolmes::EncodingDetector.new
  end

  def test_has_class_level_detect_method
    # The respond_to? result must be asserted, otherwise the check is a no-op.
    assert_respond_to CharlockHolmes::EncodingDetector, :detect
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_class_level_detect_accepts_encoding_hint
    assert_respond_to CharlockHolmes::EncodingDetector, :detect
    detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_has_class_level_detect_all_method
    assert_respond_to CharlockHolmes::EncodingDetector, :detect_all
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
    assert_plain_ascii_candidates detected_list
  end

  def test_class_level_detect_all_method_accepts_encoding_hint
    assert_respond_to CharlockHolmes::EncodingDetector, :detect_all
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
    assert_plain_ascii_candidates detected_list
  end

  def test_has_detect_method
    assert_respond_to @detector, :detect
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_detect_accepts_encoding_hint
    assert_respond_to @detector, :detect
    detected = @detector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_has_detect_all_method
    assert_respond_to @detector, :detect_all
    detected_list = @detector.detect_all 'test'
    assert_plain_ascii_candidates detected_list
  end

  def test_detect_all_accepts_encoding_hint
    assert_respond_to @detector, :detect_all
    detected_list = @detector.detect_all 'test', 'UTF-8'
    assert_plain_ascii_candidates detected_list
  end

  def test_strip_tags_flag
    detector = CharlockHolmes::EncodingDetector.new
    markup = "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"

    detector.strip_tags = true
    assert detector.strip_tags

    detection = detector.detect markup
    assert_equal 'UTF-8', detection[:encoding]

    detector.strip_tags = false
    refute detector.strip_tags

    # The same markup is expected to be detected as UTF-8 with the flag off too.
    detection = detector.detect markup
    assert_equal 'UTF-8', detection[:encoding]
  end

  def test_has_list_of_supported_encodings
    assert_respond_to CharlockHolmes::EncodingDetector, :supported_encodings
    supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings

    assert_kind_of Array, supported_encodings
    assert_includes supported_encodings, 'UTF-8'
  end

  def test_detection_works_as_expected
    MAPPING.each do |file, encoding, type|
      path = File.expand_path "../fixtures/#{file}", __FILE__
      content = File.read path
      guessed = @detector.detect content

      # Name the fixture in every message so a failure points at the file.
      assert_equal encoding, guessed[:encoding], "unexpected encoding for #{file}"
      assert_equal type, guessed[:type], "unexpected type for #{file}"

      # Only text results carry an encoding that can be applied to the string;
      # the respond_to? guard skips this on Rubies without String#force_encoding.
      if content.respond_to?(:force_encoding) && guessed[:type] == :text
        content.force_encoding guessed[:encoding]
        assert content.valid_encoding?, "#{file} is not valid #{guessed[:encoding]}"
      end
    end
  end

  private

  # Shared check for the detect_all tests: the result must be an Array whose
  # :encoding values, once sorted, are exactly the three candidates these
  # tests expect for the plain ASCII input 'test'.
  def assert_plain_ascii_candidates(detected_list)
    assert_kind_of Array, detected_list

    encoding_list = detected_list.map { |d| d[:encoding] }.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end
end