# -*- coding: utf-8 -*-
#
import unittest
from nose import tools
from nose.plugins.skip import SkipTest
try:
import chardet
except ImportError:
chardet = None
from kitchen.text import misc
from kitchen.text.exceptions import ControlCharError
from kitchen.text.converters import to_unicode
import base_classes
class TestTextMisc(unittest.TestCase, base_classes.UnicodeTestData):
    '''Tests for the helper functions in kitchen.text.misc

    The sample strings used below (``self.u_spanish``, ``self.utf8_japanese``,
    ``self.b_ascii``, ...) are not defined in this class; presumably they are
    provided by the ``base_classes.UnicodeTestData`` mixin.
    '''

    def test_guess_encoding_no_chardet(self):
        '''Test guess_encoding with chardet explicitly disabled'''
        # Test that unicode strings are not allowed
        tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish)

        tools.eq_(misc.guess_encoding(self.utf8_spanish, disable_chardet=True), 'utf-8')
        tools.eq_(misc.guess_encoding(self.latin1_spanish, disable_chardet=True), 'latin-1')
        tools.eq_(misc.guess_encoding(self.utf8_japanese, disable_chardet=True), 'utf-8')
        # With chardet disabled, the euc_jp sample is expected to be reported
        # as latin-1 rather than as its real encoding
        tools.eq_(misc.guess_encoding(self.euc_jp_japanese, disable_chardet=True), 'latin-1')

    def test_guess_encoding_with_chardet(self):
        '''Test that decoding with the guessed encoding round-trips'''
        # We go this slightly roundabout way because multiple encodings can
        # output the same byte sequence.  What we're really interested in is
        # if we can get the original unicode string without knowing the
        # converters beforehand
        tools.eq_(to_unicode(self.utf8_spanish,
                             misc.guess_encoding(self.utf8_spanish)),
                  self.u_spanish)
        tools.eq_(to_unicode(self.latin1_spanish,
                             misc.guess_encoding(self.latin1_spanish)),
                  self.u_spanish)
        tools.eq_(to_unicode(self.utf8_japanese,
                             misc.guess_encoding(self.utf8_japanese)),
                  self.u_japanese)

    def test_guess_encoding_with_chardet_installed(self):
        '''Test the euc_jp sample when the chardet import succeeded'''
        if not chardet:
            raise SkipTest('chardet not installed, euc_jp will not be guessed correctly')
        tools.eq_(to_unicode(self.euc_jp_japanese,
                             misc.guess_encoding(self.euc_jp_japanese)),
                  self.u_japanese)

    def test_guess_encoding_with_chardet_uninstalled(self):
        '''Test the euc_jp sample when the chardet import failed'''
        if chardet:
            raise SkipTest('chardet installed, euc_jp will not be mangled')
        tools.eq_(to_unicode(self.euc_jp_japanese,
                             misc.guess_encoding(self.euc_jp_japanese)),
                  self.u_mangled_euc_jp_as_latin1)

    def test_str_eq(self):
        '''Test str_eq on byte string and unicode string combinations'''
        # str vs str:
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.euc_jp_japanese), True)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.utf8_japanese), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.b_ascii), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.latin1_spanish), False)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.euc_jp_japanese), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.b_ascii[:-2]), False)

        # unicode vs unicode:
        tools.eq_(misc.str_eq(self.u_japanese, self.u_japanese), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.u_ascii), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.u_spanish), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.u_ascii[:-2]), False)

        # unicode vs str with default utf-8 conversion:
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii[:-2]), False)

        # unicode vs str with explicit encodings:
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='euc_jp'), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='utf8'), True)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii, encoding='latin1'), True)
        tools.eq_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='latin1'), False)
        tools.eq_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='euc_jp'), False)
        tools.eq_(misc.str_eq(self.u_ascii, self.b_ascii[:-2], encoding='latin1'), False)

        # str vs unicode (reverse parameter order of unicode vs str)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii[:-2]), False)

        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='euc_jp'), True)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='utf8'), True)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii, encoding='latin1'), True)
        tools.eq_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='latin1'), False)
        tools.eq_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='euc_jp'), False)
        tools.eq_(misc.str_eq(self.b_ascii, self.u_ascii[:-2], encoding='latin1'), False)

    def test_process_control_chars(self):
        '''Test each strategy of process_control_chars'''
        tools.assert_raises(TypeError, misc.process_control_chars, 'byte string')
        tools.assert_raises(ControlCharError, misc.process_control_chars,
                            self.u_ascii_chars, strategy='strict')
        tools.eq_(misc.process_control_chars(self.u_ascii_chars, strategy='ignore'),
                  self.u_ascii_no_ctrl)
        tools.eq_(misc.process_control_chars(self.u_ascii_chars, strategy='replace'),
                  self.u_ascii_ctrl_replace)

    def test_html_entities_unescape(self):
        '''Test html_entities_unescape on entities, tags and numeric references'''
        tools.assert_raises(TypeError, misc.html_entities_unescape, 'byte string')
        tools.eq_(misc.html_entities_unescape(self.u_entity_escape), self.u_entity)
        # Surrounding markup is expected to be dropped along with the unescape
        tools.eq_(misc.html_entities_unescape(u'<tag>%s</tag>' % self.u_entity_escape),
                  self.u_entity)

        # NOTE(review): the three inputs below are written as numeric
        # character references so that they really exercise the unescaping
        # code; confirm the exact spellings against the upstream kitchen tests.
        # A reference outside the unicode range is expected to pass through
        # unchanged
        tools.eq_(misc.html_entities_unescape(u'a&#1234567890;b'), u'a&#1234567890;b')
        # Hexadecimal and decimal references to U+FFFD
        tools.eq_(misc.html_entities_unescape(u'a&#xfffd;b'), u'a\ufffdb')
        tools.eq_(misc.html_entities_unescape(u'a&#65533;b'), u'a\ufffdb')

    def test_byte_string_valid_xml(self):
        '''Test byte_string_valid_xml with matching and mismatched encodings'''
        tools.eq_(misc.byte_string_valid_xml(u'unicode string'), False)

        tools.ok_(misc.byte_string_valid_xml(self.utf8_japanese))
        tools.ok_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'euc_jp'))

        tools.eq_(misc.byte_string_valid_xml(self.utf8_japanese, 'euc_jp'), False)
        tools.eq_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'utf8'), False)

        tools.eq_(misc.byte_string_valid_xml(self.utf8_ascii_chars), False)

    def test_byte_string_valid_encoding(self):
        '''Test that a byte sequence is validated'''
        tools.eq_(misc.byte_string_valid_encoding(self.utf8_japanese), True)
        tools.eq_(misc.byte_string_valid_encoding(self.euc_jp_japanese, encoding='euc_jp'), True)

    def test_byte_string_invalid_encoding(self):
        '''Test that we return False with non-encoded chars'''
        tools.eq_(misc.byte_string_valid_encoding('\xff'), False)
        tools.eq_(misc.byte_string_valid_encoding(self.euc_jp_japanese), False)
class TestIsStringTypes(unittest.TestCase):
    '''Tests for the string type predicates in kitchen.text.misc

    Every test passes the plain ``'abc'`` literal, the ``u'abc'`` literal and
    the integer ``5`` to one predicate and checks the truth of each answer.
    '''

    def test_isbasestring(self):
        '''Test that isbasestring is true for both literals but not the int'''
        for text in ('abc', u'abc'):
            tools.assert_true(misc.isbasestring(text))
        tools.assert_false(misc.isbasestring(5))

    def test_isbytestring(self):
        '''Test that isbytestring is true only for the plain literal'''
        tools.assert_true(misc.isbytestring('abc'))
        for rejected in (u'abc', 5):
            tools.assert_false(misc.isbytestring(rejected))

    def test_isunicodestring(self):
        '''Test that isunicodestring is true only for the u'' literal'''
        for rejected in ('abc', 5):
            tools.assert_false(misc.isunicodestring(rejected))
        tools.assert_true(misc.isunicodestring(u'abc'))
|