File: pyparse.py

from pypy.interpreter.error import OperationError
from pypy.interpreter.pyparser import future, parser, pytokenizer, pygram, error, pytoken
from pypy.interpreter.astcompiler import consts
from pypy.module.sys.version import CPYTHON_VERSION
from rpython.rlib import rstring

CPYTHON_MINOR_VERSION = CPYTHON_VERSION[1]

def recode_to_utf8(space, bytes, encoding):
    if encoding == 'utf-8':
        return bytes
    w_text = space.call_method(space.newbytes(bytes), "decode",
                               space.newtext(encoding))
    w_recoded = space.call_method(w_text, "encode", space.newtext("utf-8"))
    return space.bytes_w(w_recoded)

def _normalize_encoding(encoding):
    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'
    for implementation details / reference

    NOTE: for now, parser.suite() raises a MemoryError when
          a bad encoding is used. (SF bug #979739)
    """
    if encoding is None:
        return None
    # lower() + '_' / '-' conversion
    encoding = encoding.replace('_', '-').lower()
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
        return 'utf-8'
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
        if (encoding == variant or
            encoding.startswith(variant + '-')):
            return 'iso-8859-1'
    return encoding
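
# Illustrative sketch (not part of the upstream module): a few spellings and
# what the normalization above turns them into.
#
#     _normalize_encoding("UTF_8")       -> 'utf-8'
#     _normalize_encoding("utf-8-sig")   -> 'utf-8'
#     _normalize_encoding("Latin-1")     -> 'iso-8859-1'
#     _normalize_encoding("ISO_8859_1")  -> 'iso-8859-1'
#     _normalize_encoding("cp1252")      -> 'cp1252'   (passed through)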

def _check_for_encoding(s):
    # PEP 263: a coding cookie is only honoured on the first or second line.
    eol = s.find('\n')
    if eol < 0:
        return _check_line_for_encoding(s)[0]
    enc, again = _check_line_for_encoding(s[:eol])
    if enc or not again:
        return enc
    eol2 = s.find('\n', eol + 1)
    if eol2 < 0:
        return _check_line_for_encoding(s[eol + 1:])[0]
    return _check_line_for_encoding(s[eol + 1:eol2])[0]


def _check_line_for_encoding(line):
    """Return (encoding, again): the encoding declared on this line (or
    None), plus whether the next line may still carry the declaration."""
    i = 0
    for i in range(len(line)):
        if line[i] == '#':
            break
        if line[i] not in ' \t\014':
            return None, False  # Not a comment, don't read the second line.
    return pytokenizer.match_encoding_declaration(line[i:]), True
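
# Hedged illustration of the two-line scan above (PEP 263); the exact cookie
# syntax accepted is whatever pytokenizer.match_encoding_declaration matches:
#
#     _check_for_encoding("# -*- coding: latin-1 -*-\nx = 1\n")     -> 'latin-1'
#     _check_for_encoding("#!/usr/bin/env pypy\n# coding: utf-8\n") -> 'utf-8'
#     _check_for_encoding("x = 1\n# coding: latin-1\n")             -> None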


class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', 'single' or 'func_type')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    * last_future_import: The line number and offset of the last __future__
      import.
    * hidden_applevel: Will this code unit and its sub-units be hidden at
      the applevel?
    * optimize: optimization level:
         0 = no optimization,
         1 = remove asserts,
         2 = remove docstrings.
    """

    def __init__(self, filename, mode="exec", flags=0, future_pos=(0, 0),
                 hidden_applevel=False, optimize=0, feature_version=-1):
        assert optimize >= 0
        if feature_version == -1:
            feature_version = CPYTHON_MINOR_VERSION
        if feature_version < 7:
            flags |= consts.PyCF_ASYNC_HACKS
        rstring.check_str0(filename)
        self.filename = filename
        self.mode = mode
        self.encoding = None
        self.flags = flags
        self.optimize = optimize
        self.last_future_import = future_pos
        self.hidden_applevel = hidden_applevel
        self.feature_version = feature_version
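
# Minimal usage sketch (hypothetical filename): CompileInfo carries the
# mutable per-compilation state; parsing fills in 'encoding', 'flags' and
# 'last_future_import' as it goes.
#
#     info = CompileInfo("<test>", mode="exec", optimize=1)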


_targets = {
    'eval': pygram.syms.eval_input,
    'single': pygram.syms.single_input,
    'exec': pygram.syms.file_input,
    'func_type': pygram.syms.func_type_input,
}

class PythonParser(parser.Parser):

    def __init__(self, space, future_flags=future.futureFlags_3_9,
                 grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)
        self.space = space
        self.future_flags = future_flags
        self.type_ignores = []

    def reset(self):
        self.type_ignores = []

    def parse_source(self, bytessrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.
        """
        textsrc = self._handle_encoding(bytessrc, compile_info, self.space)
        return self._parse(textsrc, compile_info)
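
    # Hedged usage sketch: 'space' is an interpreter ObjSpace, so this only
    # runs inside PyPy itself, not under a plain CPython:
    #
    #     tree = PythonParser(space).parse_source(
    #         "x = 1\n", CompileInfo("<test>", "exec"))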

    @staticmethod
    def _handle_encoding(bytessrc, compile_info, space):
        # Detect the source encoding; this also updates compile_info.flags.
        explicit_encoding = False
        enc = None
        if compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'

        if compile_info.flags & consts.PyCF_IGNORE_COOKIE:
            textsrc = bytessrc
        elif bytessrc.startswith("\xEF\xBB\xBF"):
            bytessrc = bytessrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given, check that it is utf-8.
            decl_enc = _check_for_encoding(bytessrc)
            explicit_encoding = (decl_enc is not None)
            if decl_enc and _normalize_encoding(decl_enc) != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                        filename=compile_info.filename)
            textsrc = bytessrc
        else:
            enc = _normalize_encoding(_check_for_encoding(bytessrc))
            explicit_encoding = (enc is not None)
            if enc is None:
                enc = 'utf-8'
            try:
                textsrc = recode_to_utf8(space, bytessrc, enc)
            except OperationError as e:
                # if the codec is not found, LookupError is raised.  the
                # match() check is used so that a potential IndexError or
                # KeyError is not masked
                if e.match(space, space.w_LookupError):
                    raise error.SyntaxError("Unknown encoding: %s" % enc,
                                            filename=compile_info.filename)
                # Transform unicode errors into SyntaxError
                if e.match(space, space.w_UnicodeDecodeError):
                    e.normalize_exception(space)
                    w_message = space.str(e.get_w_value(space))
                    raise error.SyntaxError(space.text_w(w_message))
                raise
        if enc is not None:
            compile_info.encoding = enc
        if explicit_encoding:
            compile_info.flags |= consts.PyCF_FOUND_ENCODING
        return textsrc
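
    # Summary of the three decoding paths above:
    #
    #     PyCF_IGNORE_COOKIE set      -> source used as-is
    #     UTF-8 BOM (\xEF\xBB\xBF)   -> BOM stripped; a non-utf-8 cookie
    #                                    is rejected with a SyntaxError
    #     otherwise                   -> the cookie (or the utf-8 default)
    #                                    drives a decode/re-encode to utf-8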

    def _get_possible_arcs(self, arcs):
        # Handle func_body_suite specially: when the indented body is
        # missing, drop the optional TYPE_COMMENT arc so that the failure
        # surfaces as an IndentationError instead of a plain SyntaxError.
        if len(arcs) == 2:
            for n, arc in enumerate(arcs):
                if self.grammar.labels[arc[0]] == pygram.tokens.TYPE_COMMENT:
                    break
            else:
                return arcs
            arcs.pop(n)
        return arcs

    def _parse(self, textsrc, compile_info):
        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
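        # e.g. "if x:\n    pass" (no trailing "\n") becomes the list
        # ["if x:\n", "    pass\n"]; since the original text did not end
        # with "\n", PyCF_DONT_IMPLY_DEDENT is left as the caller set it.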

        self.prepare(_targets[compile_info.mode])
        try:
            last_token_seen = None
            next_token_seen = None
            try:
                # Note: we no longer pass the CO_FUTURE_* flags to the
                # tokenizer, which is expected to work independently of
                # them.  That is certainly the case for all futures in
                # Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except error.TokenIndentationError as e:
                e.filename = compile_info.filename
                raise

            newflags, last_future_import = (
                future.add_future_flags(self.future_flags, tokens))
            compile_info.last_future_import = last_future_import
            compile_info.flags |= newflags
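            # e.g. a "from __future__ import annotations" line sets the
            # matching CO_FUTURE_* bit here and records the position of the
            # last __future__ import in compile_info.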

            self.grammar = pygram.choose_grammar(
                print_function=True,
                revdb=self.space.config.translation.reverse_debugger)
            try:
                tokens_stream = iter(tokens)

                token = None
                for token in tokens_stream:
                    next_token_seen = token
                    # Special handling for TYPE_IGNOREs
                    if token.token_type == pygram.tokens.TYPE_IGNORE:
                        self.type_ignores.append(token)
                    elif self.add_token(token):
                        break
                    last_token_seen = token
                last_token_seen = None
                next_token_seen = None

                if compile_info.mode == 'single':
                    self._check_token_stream_single(compile_info, tokens_stream)

            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except error.TokenIndentationError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if e.token.token_type == pygram.tokens.INDENT:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens.INDENT:
                    # hack a bit to find the line that starts the block. should
                    # be two stack entries back
                    msg = _compute_indentation_error_msg(self.stack)
                else:
                    new_err = error.SyntaxError
                    if (last_token_seen is not None and
                            last_token_seen.value in ('print', 'exec') and
                            next_token_seen is not None and
                            next_token_seen.value != '('):
                        msg = "Missing parentheses in call to '%s'" % (
                            last_token_seen.value,)
                    else:
                        msg = "invalid syntax"
                    if e.expected_str is not None:
                        msg += " (expected '%s')" % e.expected_str

                # parser.ParseError(...).column is 0-based, but the offsets in the
                # exceptions in the error module are 1-based, hence the '+ 1'
                raise new_err(msg, e.token.lineno, e.token.column + 1, e.token.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        return tree

    @staticmethod
    def _check_token_stream_single(compile_info, tokens_stream):
        for token in tokens_stream:
            if token.token_type == pygram.tokens.ENDMARKER:
                break
            if token.token_type == pygram.tokens.NEWLINE:
                continue

            if token.token_type == pygram.tokens.COMMENT:
                for token in tokens_stream:
                    if token.token_type == pygram.tokens.NEWLINE:
                        break
            else:
                new_err = error.SyntaxError
                msg = ("multiple statements found while "
                       "compiling a single statement")
                raise new_err(msg, token.lineno, token.column,
                              token.line, compile_info.filename)
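
# Illustrative note: CPython emits the same message in 'single' mode, e.g.
#
#     >>> compile("a = 1\nb = 2\n", "<input>", "single")
#     SyntaxError: multiple statements found while compiling a single statement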

SUITE_STARTERS = dict.fromkeys(
    "def if elif else while for try except finally with class".split())

def _compute_indentation_error_msg(stack):
    if stack.next is not None:
        stack = stack.next
        node = stack.node
        num_children = node.num_children()
        if num_children and \
                node.get_child(num_children - 1).type == pygram.tokens.COLON:
            # found the start of the suite; now identify the keyword
            # terminal that opens the block, searching backwards from
            # the colon
            for i in range(num_children - 2, -1, -1):
                child = node.get_child(i)
                if child.type == pygram.tokens.NAME:
                    assert isinstance(child, parser.Terminal)
                    if child.value not in SUITE_STARTERS:
                        continue
                    if child.value == "def":
                        return "expected an indented block after function definition on line %s" % (
                                child.lineno)

                    return "expected an indented block after '%s' statement on line %s" % (
                            child.value, child.lineno)
    return "expected an indented block"


class PegParser(object):
    def __init__(self, space, future_flags=future.futureFlags_3_9):
        self.space = space
        self.future_flags = future_flags
        self.type_ignores = []

    def reset(self):
        pass

    def parse_source(self, bytessrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.
        """
        textsrc = PythonParser._handle_encoding(bytessrc, compile_info, self.space)
        return self._parse(textsrc, compile_info)

    def _parse(self, textsrc, compile_info):
        from pypy.interpreter.pyparser.rpypegparse import PythonParser
        # XXX too much copy-paste
        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if (textsrc and textsrc[-1] == "\n") or compile_info.mode != "single":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        try:
            # Note: we no longer pass the CO_FUTURE_* flags to the
            # tokenizer, which is expected to work independently of
            # them.  That is certainly the case for all futures in
            # Python <= 2.7.
            tokens = pytokenizer.generate_tokens(source_lines, flags)
        except error.TokenError as e:
            e.filename = compile_info.filename
            raise
        except error.TokenIndentationError as e:
            e.filename = compile_info.filename
            raise


        newflags, last_future_import = (
            future.add_future_flags(self.future_flags, tokens))
        compile_info.last_future_import = last_future_import
        compile_info.flags |= newflags

        mode = compile_info.mode
        if mode != "single":
            # drop the trailing NEWLINE token that the tokenizer always
            # emits just before ENDMARKER; it is kept only in 'single' mode
            assert tokens[-2].token_type == pytoken.python_tokens['NEWLINE']
            del tokens[-2]
        pp = PythonParser(self.space, tokens, compile_info)
        try:
            for token in tokens:
                # Special handling for TYPE_IGNOREs
                if token.token_type == pygram.tokens.TYPE_IGNORE:
                    self.type_ignores.append(token)
            if mode == "exec":
                meth = PythonParser.file
            elif mode == "single":
                meth = PythonParser.interactive
            elif mode == "eval":
                meth = PythonParser.eval
            elif mode == "func_type":
                meth = PythonParser.func_type
            else:
                assert 0, "unknown mode"
            return pp.parse_meth_or_raise(meth)
        except error.TokenError as e:
            e.filename = compile_info.filename
            raise
        except error.TokenIndentationError as e:
            e.filename = compile_info.filename
            raise
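

# Hedged end-to-end sketch: both parser classes expose the same
# parse_source() entry point, so callers can switch between the old CFG
# parser and the PEG parser without further changes ('space' again assumed
# to be an interpreter ObjSpace):
#
#     tree = PegParser(space).parse_source("x = 1\n", CompileInfo("<test>"))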