File: test_parsestring.py

package info (click to toggle)
pypy3 7.3.19%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 212,236 kB
  • sloc: python: 2,098,316; ansic: 540,565; sh: 21,462; asm: 14,419; cpp: 4,451; makefile: 4,209; objc: 761; xml: 530; exp: 499; javascript: 314; pascal: 244; lisp: 45; csh: 12; awk: 4
file content (141 lines) | stat: -rw-r--r-- 5,707 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from pypy.interpreter.pyparser import parsestring
from pypy.interpreter.pyparser.error import SyntaxError
from pypy.interpreter.pyparser.parser import Token

import py, sys

class TestParsetring:
    def parse_and_compare(self, literal, value, encoding=None):
        space = self.space
        w_ret = parsestring.parsestr(space, encoding, literal)
        if isinstance(value, str):
            assert space.type(w_ret) == space.w_bytes
            assert space.bytes_w(w_ret) == value
        elif isinstance(value, unicode):
            assert space.type(w_ret) == space.w_unicode
            assert space.utf8_w(w_ret).decode('utf8') == value
        else:
            assert False

    def test_simple(self):
        space = self.space
        for s in ['hello world', 'hello\n world']:
            self.parse_and_compare('b' + repr(s), s)

        self.parse_and_compare("b'''hello\\x42 world'''", 'hello\x42 world')

        # octal
        self.parse_and_compare(r'b"\0"', chr(0))
        self.parse_and_compare(r'br"\0"', '\\0')
        self.parse_and_compare(r'rb"\0"', '\\0')
        self.parse_and_compare(r'b"\07"', chr(7))
        self.parse_and_compare(r'b"\123"', chr(0123))
        self.parse_and_compare(r'b"\400"', chr(0))
        self.parse_and_compare(r'b"\9"', '\\' + '9')
        self.parse_and_compare(r'b"\08"', chr(0) + '8')

        # hexadecimal
        self.parse_and_compare(r'b"\xfF"', chr(0xFF))
        self.parse_and_compare(r'b"\""', '"')
        self.parse_and_compare(r"b'\''", "'")
        for s in (r'b"\x"', r'b"\x7"', r'b"\x7g"'):
            space.raises_w(space.w_ValueError,
                           parsestring.parsestr, space, None, s)

        # only ASCII characters are allowed in bytes literals (but of course
        # you can use escapes to get the non-ASCII ones (note that in the
        # second case we use a raw string, the the parser actually sees the
        # chars '\' 'x' 'e' '9'
        excinfo = py.test.raises(SyntaxError,
                       parsestring.parsestr, space, None,
                       "b'     \xe9'", Token(None, -1, "b'     \xe9'", 5, 0))
        self.parse_and_compare(r"b'\xe9'", chr(0xE9))

    def test_unicode(self):
        for s in ['hello world', 'hello\n world']:
            self.parse_and_compare(repr(s), unicode(s))

        self.parse_and_compare("'''hello\\x42 world'''",
                               u'hello\x42 world')
        self.parse_and_compare("'''hello\\u0842 world'''",
                               u'hello\u0842 world')

        s = "u'\x81'"
        s = s.decode("koi8-u").encode("utf8")[1:]
        w_ret = parsestring.parsestr(self.space, 'koi8-u', s)
        ret = w_ret._utf8.decode('utf8')
        assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'")

    def test_unicode_pep414(self):
        space = self.space
        for s in [u'hello world', u'hello\n world']:
            self.parse_and_compare(repr(s), unicode(s))

        self.parse_and_compare("u'''hello\\x42 world'''",
                               u'hello\x42 world')
        self.parse_and_compare("u'''hello\\u0842 world'''",
                               u'hello\u0842 world')

        space.raises_w(space.w_ValueError,
                       parsestring.parsestr, space, None, "ur'foo'")

    def test_unicode_literals(self):
        space = self.space
        w_ret = parsestring.parsestr(space, None, repr("hello"))
        assert space.isinstance_w(w_ret, space.w_unicode)
        w_ret = parsestring.parsestr(space, None, "b'hi'")
        assert space.isinstance_w(w_ret, space.w_bytes)
        w_ret = parsestring.parsestr(space, None, "r'hi'")
        assert space.isinstance_w(w_ret, space.w_unicode)

    def test_raw_unicode_literals(self):
        space = self.space
        w_ret = parsestring.parsestr(space, None, "r'\u'")
        assert space.int_w(space.len(w_ret)) == 2

    def test_bytes(self):
        space = self.space
        b = "b'hello'"
        w_ret = parsestring.parsestr(space, None, b)
        assert space.unwrap(w_ret) == "hello"
        b = "b'''hello'''"
        w_ret = parsestring.parsestr(space, None, b)
        assert space.unwrap(w_ret) == "hello"

    def test_simple_enc_roundtrip(self):
        space = self.space
        s = "'\x81\\t'"
        s = s.decode("koi8-u").encode("utf8")
        w_ret = parsestring.parsestr(self.space, 'koi8-u', s)
        ret = space.unwrap(w_ret)
        assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81\\t'")

    def test_multiline_unicode_strings_with_backslash(self):
        space = self.space
        s = '"""' + '\\' + '\n"""'
        w_ret = parsestring.parsestr(space, None, s)
        assert space.text_w(w_ret) == ''

    def test_bug1(self):
        space = self.space
        expected = ['x', ' ', chr(0xc3), chr(0xa9), ' ', '\n']
        input = ["'", 'x', ' ', chr(0xc3), chr(0xa9), ' ', chr(92), 'n', "'"]
        w_ret = parsestring.parsestr(space, 'utf8', ''.join(input))
        assert space.text_w(w_ret) == ''.join(expected)

    def test_wide_unicode_in_source(self):
        if sys.maxunicode == 65535:
            py.test.skip("requires a wide-unicode host")
        self.parse_and_compare('"\xf0\x9f\x92\x8b"',
                               unichr(0x1f48b),
                               encoding='utf-8')

    def test_decode_unicode_utf8(self):
        buf = parsestring.decode_unicode_utf8(self.space,
                                              'u"\xf0\x9f\x92\x8b"', 2, 6)
        assert buf == r"\U0001f48b"

    def test_parsestr_segfault(self):
        space = self.space
        space.raises_w(space.w_UnicodeDecodeError,
                       parsestring.parsestr, space, None, "r'\xc2'")