File: test_unicodehelper.py

package info (click to toggle)
pypy3 7.3.11%2Bdfsg-2%2Bdeb12u3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 201,024 kB
  • sloc: python: 1,950,308; ansic: 517,580; sh: 21,417; asm: 14,419; cpp: 4,263; makefile: 4,228; objc: 761; xml: 530; exp: 499; javascript: 314; pascal: 244; lisp: 45; csh: 11; awk: 4
file content (115 lines) | stat: -rw-r--r-- 4,375 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# encoding: utf-8
import pytest

from pypy.interpreter.unicodehelper import (
    utf8_encode_utf_8, decode_utf8sp, ErrorHandlerError
)

from pypy.interpreter.unicodehelper import str_decode_utf8, utf8_encode_latin_1
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
from pypy.interpreter.unicodehelper import utf8_encode_latin_1, str_decode_unicode_escape
from pypy.interpreter.unicodehelper import str_decode_raw_unicode_escape
from pypy.interpreter import unicodehelper as uh
from pypy.module._codecs.interp_codecs import CodecState

class Hit(Exception):
    """Marker exception; ``FakeSpace`` raises it for the Unicode error attributes."""

class FakeSpace:
    """Stub object handed to the helpers under test as their ``space`` argument.

    Looking up ``w_UnicodeEncodeError`` or ``w_UnicodeDecodeError`` raises
    ``Hit``; any other missing attribute raises ``AttributeError``.
    """

    def __getattr__(self, name):
        # Guard clause: anything other than the two Unicode error names is
        # reported as an ordinary missing attribute.
        if name not in ('w_UnicodeEncodeError', 'w_UnicodeDecodeError'):
            raise AttributeError(name)
        raise Hit


def test_encode_utf_8_combine_surrogates():
    """
    In the case of a surrogate pair, the error handler should be
    called with a start and stop position covering the full surrogate
    pair (new behavior in python3.6).
    """
    # Byte layout of the input:
    #                     /--surrogate pair--\
    #          \udc80      \ud800      \udfff
    encoded = "\xed\xb2\x80\xed\xa0\x80\xed\xbf\xbf"

    # Every slice the error handler is asked about, in call order.
    seen = []

    def errorhandler(errors, encoding, msg, s, start, end):
        """
        Record the offending slice and substitute 'abc' for it.

        Two invocations are expected:

        1. first for 0xDC80 on its own, since it is a standalone
           character and an invalid surrogate;
        2. then for 0xD800 and 0xDFFF together, since that is a valid
           surrogate pair.
        """
        offending = s.decode("utf-8")[start:end]
        seen.append(offending)
        return 'abc', end, 'b', s

    result = utf8_encode_utf_8(
        encoded,
        'strict',
        errorhandler=errorhandler,
        allow_surrogates=False,
    )
    # One 'abc' replacement per handler call.
    assert result == "abcabc"
    assert seen == [u'\udc80', u'\uD800\uDFFF']

def test_bad_error_handler():
    """A handler that returns a too-small index must raise ErrorHandlerError."""
    encoded = u"\udc80\ud800\udfff".encode("utf-8")

    def errorhandler(errors, encoding, msg, s, start, end):
        # Hands back ``start`` as the new position: the returned index is
        # too small.
        return '', start, 'b', s

    with pytest.raises(ErrorHandlerError):
        utf8_encode_utf_8(encoded, 'strict',
                          errorhandler=errorhandler, allow_surrogates=False)

def test_decode_utf8sp():
    """Check decode_utf8sp on lone surrogates, a surrogate pair and U+10000."""
    fake_space = FakeSpace()

    # A single encoded surrogate (0xD800, then 0xDC00) comes back unchanged,
    # together with the values 1 and 3.
    for lone in ("\xed\xa0\x80", "\xed\xb0\x80"):
        assert decode_utf8sp(fake_space, lone) == (lone, 1, 3)

    # Both surrogates back to back.
    pair = decode_utf8sp(fake_space, "\xed\xa0\x80\xed\xb0\x80")
    assert [ord(ch) for ch in pair[0].decode('utf8')] == [0xd800, 0xdc00]

    # A regular four-byte sequence.
    astral = decode_utf8sp(fake_space, "\xf0\x90\x80\x80")
    assert [ord(ch) for ch in astral[0].decode('utf8')] == [0x10000]


def test_utf8_encode_latin1_ascii_prefix():
    """An ASCII prefix followed by a two-byte UTF-8 character encodes to latin-1."""
    source = b'abcde\xc3\xa4g'
    expected = b'abcde\xe4g'
    # No error mode and no error handler are supplied.
    encoded = utf8_encode_latin_1(source, None, None)
    assert encoded == expected

def test_latin1_shortcut_bug(space):
    """utf8_encode_latin_1 with 'backslashreplace' must match str.encode."""
    codec_state = space.fromcache(CodecState)
    error_handler = codec_state.encode_error_handler

    # Mixes characters that fit in latin-1 with ones that do not.
    text = u"a\xac\u1234\u20ac\u8000"
    expected = text.encode("latin-1", "backslashreplace")
    actual = utf8_encode_latin_1(
        text.encode("utf-8"), "backslashreplace", error_handler)
    assert actual == expected

def test_unicode_escape_incremental_bug():
    """Decode a unicode-escape byte string in two steps, split at every offset.

    For each split point the two partial results, joined, must equal the
    expected text, and the two reported offsets must add up to the input
    length.
    """
    class FakeUnicodeDataHandler:
        # Only ever expected to be asked about the one named character
        # that appears in ``data`` below.
        def call(self, name):
            assert name == "QUESTION MARK"
            return ord("?")

    handler = FakeUnicodeDataHandler()
    expected = u"äҰ𐀂?"
    data = b'\\xe4\\u04b0\\U00010002\\N{QUESTION MARK}'
    total = len(data)
    for split in range(1, total):
        # First step: only the prefix is available; third argument is False
        # (presumably the "final" flag -- confirm against the helper).
        head, _, consumed, _ = str_decode_unicode_escape(
            data[:split], 'strict', False, None, handler)
        # Second step: resume at the offset the first step reported.
        remainder = data[consumed:split] + data[split:]
        tail, _, consumed_tail, _ = str_decode_unicode_escape(
            remainder, 'strict', True, None, handler)
        assert consumed + consumed_tail == total
        assert expected == (head + tail).decode("utf-8")

def test_raw_unicode_escape_incremental_bug():
    """Decode a raw-unicode-escape byte string in two steps, split at every offset.

    For each split point the two partial results, joined, must equal the
    expected text, and the two reported offsets must add up to the input
    length.
    """
    expected = u"xҰa𐀂"
    data = b'x\\u04b0a\\U00010002'
    total = len(data)
    for split in range(1, total):
        # First step sees only the prefix; third argument is False
        # (presumably the "final" flag -- confirm against the helper).
        head, _, consumed = str_decode_raw_unicode_escape(
            data[:split], 'strict', False, None)
        # Second step resumes at the offset the first step reported.
        remainder = data[consumed:split] + data[split:]
        tail, _, consumed_tail = str_decode_raw_unicode_escape(
            remainder, 'strict', True, None)
        assert consumed + consumed_tail == total
        assert expected == (head + tail).decode("utf-8")

def test_raw_unicode_escape_backslash_without_escape():
    """Backslashes that do not start an escape must pass through unchanged."""
    # Contains a backslash in the middle and one as the very last byte.
    data = b'[:/?#[\\]@]\\'
    decoded, _, consumed = str_decode_raw_unicode_escape(
        data, 'strict', True, None)
    assert consumed == len(data)
    assert decoded == data