1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
import py
import sys
class AppTestUnicodeData:
    """App-level tests for the ``unicodedata`` module.

    ``raises`` is neither defined nor imported in this file; it is assumed
    to be injected into the test namespace by the app-level test harness
    (TODO confirm).  Each test imports ``unicodedata`` locally.
    """

    spaceconfig = dict(usemodules=('unicodedata',))

    def test_hangul_syllables(self):
        """name() and lookup() must round-trip for sampled Hangul syllables."""
        import unicodedata
        # Test all leading, vowel and trailing jamo
        # but not every combination of them.
        for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'),
                           (0xAE69, 'HANGUL SYLLABLE GGAEG'),
                           (0xB0D2, 'HANGUL SYLLABLE NYAGG'),
                           (0xB33B, 'HANGUL SYLLABLE DYAEGS'),
                           (0xB5A4, 'HANGUL SYLLABLE DDEON'),
                           (0xB80D, 'HANGUL SYLLABLE RENJ'),
                           (0xBA76, 'HANGUL SYLLABLE MYEONH'),
                           (0xBCDF, 'HANGUL SYLLABLE BYED'),
                           (0xBF48, 'HANGUL SYLLABLE BBOL'),
                           (0xC1B1, 'HANGUL SYLLABLE SWALG'),
                           (0xC41A, 'HANGUL SYLLABLE SSWAELM'),
                           (0xC683, 'HANGUL SYLLABLE OELB'),
                           (0xC8EC, 'HANGUL SYLLABLE JYOLS'),
                           (0xCB55, 'HANGUL SYLLABLE JJULT'),
                           (0xCDBE, 'HANGUL SYLLABLE CWEOLP'),
                           (0xD027, 'HANGUL SYLLABLE KWELH'),
                           (0xD290, 'HANGUL SYLLABLE TWIM'),
                           (0xD4F9, 'HANGUL SYLLABLE PYUB'),
                           (0xD762, 'HANGUL SYLLABLE HEUBS'),
                           (0xAE27, 'HANGUL SYLLABLE GYIS'),
                           (0xB090, 'HANGUL SYLLABLE GGISS'),
                           (0xB0AD, 'HANGUL SYLLABLE NANG'),
                           (0xB316, 'HANGUL SYLLABLE DAEJ'),
                           (0xB57F, 'HANGUL SYLLABLE DDYAC'),
                           (0xB7E8, 'HANGUL SYLLABLE RYAEK'),
                           (0xBA51, 'HANGUL SYLLABLE MEOT'),
                           (0xBCBA, 'HANGUL SYLLABLE BEP'),
                           (0xBF23, 'HANGUL SYLLABLE BBYEOH'),
                           (0xD7A3, 'HANGUL SYLLABLE HIH')):
            assert unicodedata.name(chr(code)) == name
            assert unicodedata.lookup(name) == chr(code)

        # Test outside the range
        raises(ValueError, unicodedata.name, chr(0xAC00 - 1))
        raises(ValueError, unicodedata.name, chr(0xD7A3 + 1))

    def test_cjk(self):
        """Check algorithmic CJK ideograph names at and around range edges."""
        import unicodedata
        # Compare the major version numerically: a plain string comparison
        # would rank "10.0.0" (and any later two-digit major) below "8".
        major_version = int(unicodedata.unidata_version.split('.')[0])
        assert major_version >= 8
        # Inclusive (first, last) code point ranges, as hex strings.
        cases = [
            ('3400', '4DB5'),
            ('4E00', '9FD5'),
            ('20000', '2A6D6'),
            ('2A700', '2B734'),
            ('2B740', '2B81D'),
            ('2B820', '2CEA1'),
        ]
        for first, last in cases:
            first = int(first, 16)
            last = int(last, 16)
            # Test at and inside the boundary
            for i in (first, first + 1, last - 1, last):
                charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
                char = chr(i)
                assert unicodedata.name(char) == charname
                assert unicodedata.lookup(charname) == char
            # Test outside the boundary
            for i in first - 1, last + 1:
                charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
                char = chr(i)
                # A non-raising name() call is tolerated here (presumably
                # because the neighbouring code point may be a differently
                # named character); only the message is checked when the
                # call does raise.
                try:
                    unicodedata.name(char)
                except ValueError as e:
                    assert str(e) == 'no such name'
                # The synthesized CJK name must never resolve, though.
                raises(KeyError, unicodedata.lookup, charname)

    def test_bug_1704793(self):  # from CPython
        """lookup() must resolve a name to a code point above U+FFFF."""
        import unicodedata
        assert unicodedata.lookup("GOTHIC LETTER FAIHU") == '\U00010346'

    def test_normalize_bad_argcount(self):
        """normalize() called with a single argument raises TypeError."""
        import unicodedata
        raises(TypeError, unicodedata.normalize, 'x')

    def test_normalize_nonunicode(self):
        """normalize() rejects a bytes argument with a descriptive TypeError."""
        import unicodedata
        exc_info = raises(TypeError, unicodedata.normalize, 'NFC', b'x')
        assert 'must be unicode, not' in str(exc_info.value)

    @py.test.mark.skipif("sys.maxunicode < 0x10ffff")
    def test_normalize_wide(self):
        """NFC composition works for a pair of code points above U+FFFF."""
        import unicodedata
        assert unicodedata.normalize('NFC', '\U000110a5\U000110ba') == '\U000110ab'

    def test_linebreaks(self):
        """str.splitlines() splits exactly on the listed line-break characters.

        Every code point within two positions of a listed line break is
        probed, so the neighbours that are not line breaks are covered too.
        """
        linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
        for i in linebreaks:
            for j in range(-2, 3):
                lines = (chr(i + j) + 'A').splitlines()
                if i + j in linebreaks:
                    assert len(lines) == 2
                else:
                    assert len(lines) == 1

    def test_mirrored(self):
        """mirrored() returns an int whose repr is '0' for a space."""
        import unicodedata
        # For no reason, unicodedata.mirrored() returns an int, not a bool
        assert repr(unicodedata.mirrored(' ')) == '0'

    def test_bidirectional_not_one_character(self):
        """bidirectional() rejects a two-character string with TypeError."""
        import unicodedata
        exc_info = raises(TypeError, unicodedata.bidirectional, u'xx')
        assert str(exc_info.value) == 'need a single Unicode character as parameter'

    def test_aliases(self):
        """Name aliases resolve via lookup() but are never returned by name().

        The aliases must also be unknown to the ``ucd_3_2_0`` database.
        """
        import unicodedata
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            name = unicodedata.name(chr(codepoint))
            assert name != alias
            assert unicodedata.lookup(alias) == unicodedata.lookup(name)
            raises(KeyError, unicodedata.ucd_3_2_0.lookup, alias)

    def test_named_sequences(self):
        """Named sequences resolve via lookup() but not in string literals."""
        import unicodedata
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            assert unicodedata.lookup(seqname) == codepoints
            # Evaluating a literal that uses the named-character escape with
            # a sequence name must be rejected at compile time.
            raises(SyntaxError, eval, r'"\N{%s}"' % seqname)

    def test_names_in_pua_range(self):
        """Sampled code points in 0xF0000-0xF02FF must have no name."""
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        import unicodedata
        for cp in range(0xf0000, 0xf0300, 7):
            exc = raises(ValueError, unicodedata.name, chr(cp))
            assert str(exc.value) == 'no such name'
|