File: test_unicode.py

package info (click to toggle)
python3.14 3.14.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 169,680 kB
  • sloc: python: 751,968; ansic: 717,163; xml: 31,250; sh: 5,989; cpp: 4,063; makefile: 1,995; objc: 787; lisp: 502; javascript: 136; asm: 75; csh: 12
file content (138 lines) | stat: -rw-r--r-- 6,019 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import codecs
from collections import OrderedDict
from test.test_json import PyTest, CTest


class TestUnicode:
    # test_encoding1 and test_encoding2 from 2.x are irrelevant (only str
    # is supported as input, not bytes).

    def test_encoding3(self):
        u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = self.dumps(u)
        self.assertEqual(j, '"\\u03b1\\u03a9"')

    def test_encoding4(self):
        u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = self.dumps([u])
        self.assertEqual(j, '["\\u03b1\\u03a9"]')

    def test_encoding5(self):
        u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = self.dumps(u, ensure_ascii=False)
        self.assertEqual(j, f'"{u}"')

    def test_encoding6(self):
        u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = self.dumps([u], ensure_ascii=False)
        self.assertEqual(j, f'["{u}"]')

    def test_encoding7(self):
        u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = self.dumps(u + "\n", ensure_ascii=False)
        self.assertEqual(j, f'"{u}\\n"')

    def test_ascii_non_printable_encode(self):
        u = '\b\t\n\f\r\x00\x1f\x7f'
        self.assertEqual(self.dumps(u),
                         '"\\b\\t\\n\\f\\r\\u0000\\u001f\\u007f"')
        self.assertEqual(self.dumps(u, ensure_ascii=False),
                         '"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')

    def test_ascii_non_printable_decode(self):
        self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
                         '\b\t\n\f\r')
        s = ''.join(map(chr, range(32)))
        for c in s:
            self.assertRaises(self.JSONDecodeError, self.loads, f'"{c}"')
        self.assertEqual(self.loads(f'"{s}"', strict=False), s)
        self.assertEqual(self.loads('"\x7f"'), '\x7f')

    def test_escaped_decode(self):
        self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'), '\b\t\n\f\r')
        self.assertEqual(self.loads('"\\"\\\\\\/"'), '"\\/')
        for c in set(map(chr, range(0x100))) - set('"\\/bfnrt'):
            self.assertRaises(self.JSONDecodeError, self.loads, f'"\\{c}"')
            self.assertRaises(self.JSONDecodeError, self.loads, f'"\\{c}"', strict=False)

    def test_big_unicode_encode(self):
        u = '\U0001d120'
        self.assertEqual(self.dumps(u), '"\\ud834\\udd20"')
        self.assertEqual(self.dumps(u, ensure_ascii=False), '"\U0001d120"')

    def test_big_unicode_decode(self):
        u = 'z\U0001d120x'
        self.assertEqual(self.loads(f'"{u}"'), u)
        self.assertEqual(self.loads('"z\\ud834\\udd20x"'), u)

    def test_unicode_decode(self):
        for i in range(0, 0xd7ff):
            u = chr(i)
            s = f'"\\u{i:04x}"'
            self.assertEqual(self.loads(s), u)

    def test_single_surrogate_encode(self):
        self.assertEqual(self.dumps('\uD83D'), '"\\ud83d"')
        self.assertEqual(self.dumps('\uD83D', ensure_ascii=False), '"\ud83d"')
        self.assertEqual(self.dumps('\uDC0D'), '"\\udc0d"')
        self.assertEqual(self.dumps('\uDC0D', ensure_ascii=False), '"\udc0d"')

    def test_single_surrogate_decode(self):
        self.assertEqual(self.loads('"\uD83D"'), '\ud83d')
        self.assertEqual(self.loads('"\\uD83D"'), '\ud83d')
        self.assertEqual(self.loads('"\udc0d"'), '\udc0d')
        self.assertEqual(self.loads('"\\udc0d"'), '\udc0d')

    def test_unicode_preservation(self):
        self.assertEqual(type(self.loads('""')), str)
        self.assertEqual(type(self.loads('"a"')), str)
        self.assertEqual(type(self.loads('["a"]')[0]), str)

    def test_bytes_encode(self):
        self.assertRaises(TypeError, self.dumps, b"hi")
        self.assertRaises(TypeError, self.dumps, [b"hi"])

    def test_bytes_decode(self):
        for encoding, bom in [
                ('utf-8', codecs.BOM_UTF8),
                ('utf-16be', codecs.BOM_UTF16_BE),
                ('utf-16le', codecs.BOM_UTF16_LE),
                ('utf-32be', codecs.BOM_UTF32_BE),
                ('utf-32le', codecs.BOM_UTF32_LE),
            ]:
            data = ["a\xb5\u20ac\U0001d120"]
            encoded = self.dumps(data).encode(encoding)
            self.assertEqual(self.loads(bom + encoded), data)
            self.assertEqual(self.loads(encoded), data)
        self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]')
        # RFC-7159 and ECMA-404 extend JSON to allow documents that
        # consist of only a string, which can present a special case
        # not covered by the encoding detection patterns specified in
        # RFC-4627 for utf-16-le (XX 00 XX 00).
        self.assertEqual(self.loads('"\u2600"'.encode('utf-16-le')),
                         '\u2600')
        # Encoding detection for small (<4) bytes objects
        # is implemented as a special case. RFC-7159 and ECMA-404
        # allow single codepoint JSON documents which are only two
        # bytes in utf-16 encodings w/o BOM.
        self.assertEqual(self.loads(b'5\x00'), 5)
        self.assertEqual(self.loads(b'\x007'), 7)
        self.assertEqual(self.loads(b'57'), 57)

    def test_object_pairs_hook_with_unicode(self):
        s = '{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}'
        p = [("xkd", 1), ("kcw", 2), ("art", 3), ("hxm", 4),
             ("qrt", 5), ("pad", 6), ("hoy", 7)]
        self.assertEqual(self.loads(s), eval(s))
        self.assertEqual(self.loads(s, object_pairs_hook = lambda x: x), p)
        od = self.loads(s, object_pairs_hook = OrderedDict)
        self.assertEqual(od, OrderedDict(p))
        self.assertEqual(type(od), OrderedDict)
        # the object_pairs_hook takes priority over the object_hook
        self.assertEqual(self.loads(s, object_pairs_hook = OrderedDict,
                                    object_hook = lambda x: None),
                         OrderedDict(p))


class TestPyUnicode(TestUnicode, PyTest):
    # Concrete test case: the TestUnicode checks combined with the PyTest
    # base imported from test.test_json.  Adds nothing of its own.
    pass
class TestCUnicode(TestUnicode, CTest):
    # Concrete test case: the TestUnicode checks combined with the CTest
    # base imported from test.test_json.  Adds nothing of its own.
    pass