File: test_unicodehelper.py

package info (click to toggle)
pypy3 7.3.19%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 212,236 kB
  • sloc: python: 2,098,316; ansic: 540,565; sh: 21,462; asm: 14,419; cpp: 4,451; makefile: 4,209; objc: 761; xml: 530; exp: 499; javascript: 314; pascal: 244; lisp: 45; csh: 12; awk: 4
file content (168 lines) | stat: -rw-r--r-- 6,086 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# encoding: utf-8
import pytest

from pypy.interpreter.unicodehelper import (
    utf8_encode_utf_8, decode_utf8sp,
)

from pypy.interpreter.unicodehelper import str_decode_utf8, utf8_encode_latin_1
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
from pypy.interpreter.unicodehelper import str_decode_unicode_escape
from pypy.interpreter.unicodehelper import str_decode_raw_unicode_escape
from pypy.interpreter.unicodehelper import utf8_encode_utf_16_le
from pypy.interpreter.unicodehelper import utf8_encode_utf_32_le
from pypy.interpreter import unicodehelper as uh
from pypy.module._codecs.interp_codecs import CodecState

class Hit(Exception):
    """Marker exception raised by FakeSpace when a Unicode error class
    (w_UnicodeEncodeError / w_UnicodeDecodeError) is looked up on it."""

class FakeSpace:
    """Tiny stand-in for an object space, wrapping an optional real one.

    Bytes are passed through untouched, text wrapping is forwarded to the
    wrapped space, and looking up one of the Unicode error classes raises
    Hit instead of returning anything.
    """

    def __init__(self, space):
        # The wrapped space; may be None when newtext() is never needed.
        self.space = space

    def newtext(self, s):
        """Wrap *s* by delegating to the underlying space."""
        wrapped = self.space
        return wrapped.newtext(s)

    def newbytes(self, s):
        """Return *s* as-is; no wrapping is performed."""
        return s

    def __getattr__(self, name):
        # Only invoked for attributes that normal lookup did not find.
        if name != 'w_UnicodeEncodeError' and name != 'w_UnicodeDecodeError':
            raise AttributeError(name)
        # Code under test asked for an error class, i.e. it is about to raise.
        raise Hit

def test_encode_utf_8_combine_surrogates(space):
    """
    In the case of a surrogate pair, the error handler should be
    called with a start and stop position of the full surrogate
    pair (new behavior in python3.6)
    """
    #               /--surrogate pair--\
    #    \udc80      \ud800      \udfff
    b = "\xed\xb2\x80\xed\xa0\x80\xed\xbf\xbf"

    # Each handler invocation records the slice of the input it was
    # asked to deal with, so the reported ranges can be checked below.
    calls = []

    def errorhandler(errors, encoding, msg, w_s, start, end):
        """
        This handler will be called twice, so asserting both times:

        1. the first time, 0xDC80 will be handled as a single surrogate,
           since it is a standalone character and an invalid surrogate.
        2. the second time, the characters will be 0xD800 and 0xDFFF, since
           that is a valid surrogate pair.
        """
        s = w_s._utf8
        # start/end are used as indices into the decoded text, not the bytes.
        calls.append(s.decode("utf-8")[start:end])
        # 'abc' is the replacement; resuming at `end` skips the reported range.
        return 'abc', end, 'b', s, w_s

    w_b = space.newtext(b)
    res = utf8_encode_utf_8(
        space, b, w_b, 'strict',
        errorhandler=errorhandler,
        allow_surrogates=False
    )
    # One replacement per handler call: lone surrogate, then the pair.
    assert res == "abcabc"
    assert calls == [u'\udc80', u'\uD800\uDFFF']

# NOTE: test_bad_error_handler() used to live here. It was replaced by
# test_repeated_pos_return in test_codecs, following CPython's approach.

def test_decode_utf8sp():
    """Check decode_utf8sp on lone surrogates, a surrogate sequence and a
    four-byte sequence."""
    fake = FakeSpace(None)

    # A single encoded surrogate comes back unchanged, with (1, 3) after it.
    for lone in ("\xed\xa0\x80", "\xed\xb0\x80"):
        assert decode_utf8sp(fake, lone) == (lone, 1, 3)

    # For longer inputs only the code points of the first result are checked.
    expectations = (
        ("\xed\xa0\x80\xed\xb0\x80", [0xd800, 0xdc00]),
        ("\xf0\x90\x80\x80", [0x10000]),
    )
    for raw, codepoints in expectations:
        got = decode_utf8sp(fake, raw)
        assert [ord(ch) for ch in got[0].decode('utf8')] == codepoints


def test_utf8_encode_latin1_ascii_prefix():
    """Latin-1 encoding of UTF-8 input that begins with a run of ASCII."""
    fake = FakeSpace(None)
    source = b'abcde\xc3\xa4g'
    # The two-byte UTF-8 sequence collapses to the single latin-1 byte 0xe4.
    expected = b'abcde\xe4g'
    encoded = utf8_encode_latin_1(fake, source, source, None, None)
    assert encoded == expected

def test_latin1_shortcut_bug(space):
    """utf8_encode_latin_1 with 'backslashreplace' must match what the host
    unicode.encode produces for text mixing encodable and unencodable
    characters."""
    error_handler = space.fromcache(CodecState).encode_error_handler

    text = u"a\xac\u1234\u20ac\u8000"
    text_utf8 = text.encode("utf-8")

    actual = utf8_encode_latin_1(
        space, text_utf8, space.newtext(text_utf8),
        "backslashreplace", error_handler
    )
    expected = text.encode("latin-1", "backslashreplace")
    assert actual == expected

def test_unicode_escape_incremental_bug(space):
    """Decode unicode-escape data in two steps at every possible split.

    The first call sees only a prefix of the data; the second call gets
    everything the first call did not consume. Together they must consume
    the whole input and produce the expected text.
    """
    class FakeUnicodeDataHandler:
        # Resolves the only character name that appears in the test data.
        def call(self, name):
            assert name == "QUESTION MARK"
            return ord("?")

    expected = u"äҰ𐀂?"
    data = b'\\xe4\\u04b0\\U00010002\\N{QUESTION MARK}'
    name_lookup = FakeUnicodeDataHandler()
    total = len(data)

    for split in range(1, total):
        head = data[:split]
        # Fifth argument False: presumably "not final", i.e. more data may follow.
        decoded_head, _, consumed_head, _ = str_decode_unicode_escape(
            space, head, space.newtext(head), 'strict', False, None, name_lookup)

        # Re-feed whatever the first call left unconsumed, plus the rest.
        tail = data[consumed_head:split] + data[split:]
        decoded_tail, _, consumed_tail, _ = str_decode_unicode_escape(
            space, tail, space.newtext(tail), 'strict', True, None, name_lookup)

        assert consumed_head + consumed_tail == total
        assert expected == (decoded_head + decoded_tail).decode("utf-8")

def test_raw_unicode_escape_incremental_bug(space):
    """Decode raw-unicode-escape data in two steps at every possible split.

    The second call receives whatever the first call left unconsumed; the
    two consumed lengths must add up to the full input and the joined
    results must equal the expected text.
    """
    expected = u"xҰa𐀂"
    data = b'x\\u04b0a\\U00010002'
    total = len(data)

    for split in range(1, total):
        head = data[:split]
        # Fifth argument False: presumably "not final", i.e. more data may follow.
        decoded_head, _, consumed_head = str_decode_raw_unicode_escape(
            space, head, space.newtext(head), 'strict', False, None)

        # Re-feed the unconsumed part of the prefix together with the rest.
        tail = data[consumed_head:split] + data[split:]
        decoded_tail, _, consumed_tail = str_decode_raw_unicode_escape(
            space, tail, space.newtext(tail), 'strict', True, None)

        assert consumed_head + consumed_tail == total
        assert expected == (decoded_head + decoded_tail).decode("utf-8")

def test_raw_unicode_escape_backslash_without_escape():
    """Backslashes that do not start an escape are passed through
    unchanged and the whole input is consumed."""
    raw = b'[:/?#[\\]@]\\'
    outcome = str_decode_raw_unicode_escape(
        FakeSpace(None), raw, raw, 'strict', True, None)
    decoded, _, consumed = outcome
    assert consumed == len(raw)
    assert decoded == raw

def test_raw_unicode_escape_bug_escape_backslash():
    """A backslash followed by another backslash, or by a non-ASCII byte,
    must be kept literally by the raw-unicode-escape decoder."""
    fake = FakeSpace(None)

    # Two backslashes stay two backslashes.
    doubled = b'\\\\'
    decoded = str_decode_raw_unicode_escape(
        fake, doubled, doubled, 'strict', True, None)[0]
    assert decoded == '\\\\'

    # Backslash followed by byte 0xef: the result is compared after UTF-8 decoding.
    with_high_byte = b'\\\xef'
    decoded = str_decode_raw_unicode_escape(
        fake, with_high_byte, with_high_byte, 'strict', True, None)[0]
    assert decoded.decode("utf-8") == u'\\\xef'

def test_utf16_encode_bytes_replacement_is_simply_copied():
    """The handler's 'abcd' replacement must appear verbatim in the
    UTF-16-LE output, between the normally encoded brackets."""
    payload = b'[\xed\xb2\x80]'
    expected = "[\x00abcd]\x00"

    def errorhandler(errors, encoding, msg, s, start, end):
        # Always substitute 'abcd' and resume right after the bad range.
        replacement = 'abcd'
        return replacement, end, 'b', s, s

    fake = FakeSpace(None)
    encoded = utf8_encode_utf_16_le(
        fake, payload, payload, 'strict',
        errorhandler=errorhandler,
        allow_surrogates=False
    )
    assert encoded == expected


def test_utf32_encode_bytes_replacement_is_simply_copied(space):
    """The handler's 'abcd' replacement must appear verbatim in the
    UTF-32-LE output, between the normally encoded brackets."""
    payload = b'[\xed\xb2\x80]'
    expected = "[\x00\x00\x00abcd]\x00\x00\x00"

    def errorhandler(errors, encoding, msg, w_s, start, end):
        # Hands back the raw input bytes from the enclosing scope together
        # with the wrapped object it was called with.
        replacement = 'abcd'
        return replacement, end, 'b', payload, w_s

    fake = FakeSpace(space)
    encoded = utf8_encode_utf_32_le(
        fake, payload, space.newtext(payload), 'strict',
        errorhandler=errorhandler,
        allow_surrogates=False
    )
    assert encoded == expected