File: test_text_misc.py

package info (click to toggle)
kitchen 1.2.6-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,284 kB
  • sloc: python: 10,749; makefile: 22; sh: 4
file content (153 lines) | stat: -rw-r--r-- 7,945 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
#
import unittest
from nose import tools
from nose.plugins.skip import SkipTest

try:
    import chardet
except ImportError:
    chardet = None

from kitchen.text import misc
from kitchen.text.exceptions import ControlCharError
from kitchen.text.converters import to_unicode

import base_classes

class TestTextMisc(unittest.TestCase, base_classes.UnicodeTestData):
    '''Tests for the helper functions in kitchen.text.misc

    The sample strings (``self.u_*``, ``self.utf8_*``, ``self.latin1_*``,
    ``self.euc_jp_*``, ...) are supplied by the base_classes.UnicodeTestData
    mixin.

    Equality checks use tools.eq_ rather than tools.ok_(a == b) so that a
    failure reports both values instead of a bare AssertionError.
    '''

    def test_guess_encoding_no_chardet(self):
        '''With chardet disabled the expected guesses are utf-8 or latin-1'''
        # Test that unicode strings are not allowed
        tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish)

        tools.eq_(misc.guess_encoding(self.utf8_spanish, disable_chardet=True), 'utf-8')
        tools.eq_(misc.guess_encoding(self.latin1_spanish, disable_chardet=True), 'latin-1')
        tools.eq_(misc.guess_encoding(self.utf8_japanese, disable_chardet=True), 'utf-8')
        # euc_jp bytes are expected to be reported as latin-1 in this mode
        tools.eq_(misc.guess_encoding(self.euc_jp_japanese, disable_chardet=True), 'latin-1')

    def test_guess_encoding_with_chardet(self):
        '''Decoding with the guessed encoding gives back the original text'''
        # We go this slightly roundabout way because multiple encodings can
        # output the same byte sequence.  What we're really interested in is
        # if we can get the original unicode string without knowing the
        # converters beforehand
        tools.eq_(to_unicode(self.utf8_spanish,
            misc.guess_encoding(self.utf8_spanish)), self.u_spanish)
        tools.eq_(to_unicode(self.latin1_spanish,
            misc.guess_encoding(self.latin1_spanish)), self.u_spanish)
        tools.eq_(to_unicode(self.utf8_japanese,
            misc.guess_encoding(self.utf8_japanese)), self.u_japanese)

    def test_guess_encoding_with_chardet_installed(self):
        '''euc_jp bytes decode to the original text when chardet is importable'''
        if not chardet:
            raise SkipTest('chardet not installed, euc_jp will not be guessed correctly')
        tools.eq_(to_unicode(self.euc_jp_japanese,
            misc.guess_encoding(self.euc_jp_japanese)), self.u_japanese)

    def test_guess_encoding_with_chardet_uninstalled(self):
        '''euc_jp bytes decode to the mangled sample when chardet is missing'''
        if chardet:
            raise SkipTest('chardet installed, euc_jp will not be mangled')
        tools.eq_(to_unicode(self.euc_jp_japanese,
            misc.guess_encoding(self.euc_jp_japanese)),
            self.u_mangled_euc_jp_as_latin1)

    def test_str_eq(self):
        '''str_eq compares byte and unicode strings in every combination'''
        # str vs str:
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.euc_jp_japanese), True)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.utf8_japanese), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.b_ascii), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.latin1_spanish), False)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.euc_jp_japanese), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.b_ascii[:-2]), False)

        # unicode vs unicode:
        tools.eq_(misc.str_eq(self.u_japanese, self.u_japanese), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.u_ascii), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.u_spanish), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.u_ascii[:-2]), False)

        # unicode vs str with default utf-8 conversion:
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii[:-2]), False)

        # unicode vs str with explicit encodings:
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='euc_jp'), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='utf8'), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii, encoding='latin1'), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='latin1'), False)
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='euc_jp'), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii[:-2], encoding='latin1'), False)

        # str vs unicode (reverse parameter order of unicode vs str)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii[:-2]), False)

        # str vs unicode with explicit encodings:
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='euc_jp'), True)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='utf8'), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii, encoding='latin1'), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='latin1'), False)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='euc_jp'), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii[:-2], encoding='latin1'), False)

    def test_process_control_chars(self):
        '''process_control_chars rejects byte strings and honours each strategy'''
        tools.assert_raises(TypeError, misc.process_control_chars, 'byte string')
        tools.assert_raises(ControlCharError, misc.process_control_chars,
                self.u_ascii_chars, strategy='strict')
        tools.eq_(misc.process_control_chars(self.u_ascii_chars,
            strategy='ignore'), self.u_ascii_no_ctrl)
        tools.eq_(misc.process_control_chars(self.u_ascii_chars,
            strategy='replace'), self.u_ascii_ctrl_replace)

    def test_html_entities_unescape(self):
        '''html_entities_unescape rejects byte strings and decodes entities'''
        tools.assert_raises(TypeError, misc.html_entities_unescape, 'byte string')
        tools.eq_(misc.html_entities_unescape(self.u_entity_escape), self.u_entity)
        # Wrapping the escaped text in a tag is expected to give the same result
        tools.eq_(misc.html_entities_unescape(u'<tag>%s</tag>'
            % self.u_entity_escape), self.u_entity)
        # This numeric reference is expected to come back unchanged
        tools.eq_(misc.html_entities_unescape(u'a&#1234567890;b'), u'a&#1234567890;b')
        # Hexadecimal and decimal references to U+FFFD give the same character
        tools.eq_(misc.html_entities_unescape(u'a&#xfffd;b'), u'a\ufffdb')
        tools.eq_(misc.html_entities_unescape(u'a&#65533;b'), u'a\ufffdb')

    def test_byte_string_valid_xml(self):
        '''byte_string_valid_xml accepts only correctly encoded byte strings'''
        tools.eq_(misc.byte_string_valid_xml(u'unicode string'), False)

        # Only truthiness is checked for the positive cases
        tools.ok_(misc.byte_string_valid_xml(self.utf8_japanese))
        tools.ok_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'euc_jp'))

        # Bytes checked against the wrong encoding are not valid
        tools.eq_(misc.byte_string_valid_xml(self.utf8_japanese, 'euc_jp'), False)
        tools.eq_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'utf8'), False)

        tools.eq_(misc.byte_string_valid_xml(self.utf8_ascii_chars), False)

    def test_byte_string_valid_encoding(self):
        '''Test that a byte sequence is validated'''
        tools.eq_(misc.byte_string_valid_encoding(self.utf8_japanese), True)
        tools.eq_(misc.byte_string_valid_encoding(self.euc_jp_japanese, encoding='euc_jp'), True)

    def test_byte_string_invalid_encoding(self):
        '''Test that we return False with non-encoded chars'''
        tools.eq_(misc.byte_string_valid_encoding('\xff'), False)
        tools.eq_(misc.byte_string_valid_encoding(self.euc_jp_japanese), False)

class TestIsStringTypes(unittest.TestCase):
    '''Tests for the string type predicates in kitchen.text.misc

    Each predicate is called with a plain ``'abc'`` literal, a ``u'abc'``
    literal and the integer ``5``.
    '''

    def test_isbasestring(self):
        '''isbasestring accepts both string literals and rejects the integer'''
        self.assertTrue(misc.isbasestring('abc'))
        self.assertTrue(misc.isbasestring(u'abc'))
        self.assertFalse(misc.isbasestring(5))

    def test_isbytestring(self):
        '''isbytestring accepts only the plain ``'abc'`` literal'''
        self.assertTrue(misc.isbytestring('abc'))
        self.assertFalse(misc.isbytestring(u'abc'))
        self.assertFalse(misc.isbytestring(5))

    def test_isunicodestring(self):
        '''isunicodestring accepts only the ``u'abc'`` literal'''
        self.assertFalse(misc.isunicodestring('abc'))
        self.assertTrue(misc.isunicodestring(u'abc'))
        self.assertFalse(misc.isunicodestring(5))