File: fast.py

package info (click to toggle)
python-jedi 0.10.0~git1%2Bf05c071-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,064 kB
  • ctags: 3,014
  • sloc: python: 16,997; makefile: 149; ansic: 13
file content (621 lines) | stat: -rw-r--r-- 23,372 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
"""
Basically a parser that is faster, because it tries to parse only parts and if
anything changes, it only reparses the changed parts. But because it's not
finished (and still not working as I want), I won't document it any further.
"""
import re
from itertools import chain

from jedi._compatibility import use_metaclass
from jedi import settings
from jedi.parser import ParserWithRecovery
from jedi.parser import tree
from jedi.parser.utils import underscore_memoization, parser_cache
from jedi import debug
from jedi.parser.tokenize import (source_tokens, NEWLINE,
                                  ENDMARKER, INDENT, DEDENT)

FLOWS = 'if', 'else', 'elif', 'while', 'with', 'try', 'except', 'finally', 'for'


class FastModule(tree.Module):
    type = 'file_input'

    def __init__(self, module_path):
        super(FastModule, self).__init__([])
        self.modules = []
        self.reset_caches()
        self.names_dict = {}
        self.path = module_path

    def reset_caches(self):
        self.modules = []
        try:
            del self._used_names  # Remove the used names cache.
        except AttributeError:
            pass  # It was never used.

    @property
    @underscore_memoization
    def used_names(self):
        return MergedNamesDict([m.used_names for m in self.modules])

    @property
    def global_names(self):
        return [name for m in self.modules for name in m.global_names]

    @property
    def error_statements(self):
        return [e for m in self.modules for e in m.error_statements]

    def __repr__(self):
        return "<fast.%s: %s@%s-%s>" % (type(self).__name__, self.name,
                                        self.start_pos[0], self.end_pos[0])

    # To avoid issues with with the `parser.ParserWithRecovery`, we need
    # setters that do nothing, because if pickle comes along and sets those
    # values.
    @global_names.setter
    def global_names(self, value):
        pass

    @error_statements.setter
    def error_statements(self, value):
        pass

    @used_names.setter
    def used_names(self, value):
        pass


class MergedNamesDict(object):
    def __init__(self, dicts):
        self.dicts = dicts

    def __iter__(self):
        return iter(set(key for dct in self.dicts for key in dct))

    def __getitem__(self, value):
        return list(chain.from_iterable(dct.get(value, []) for dct in self.dicts))

    def items(self):
        dct = {}
        for d in self.dicts:
            for key, values in d.items():
                try:
                    dct_values = dct[key]
                    dct_values += values
                except KeyError:
                    dct[key] = list(values)
        return dct.items()

    def values(self):
        lst = []
        for dct in self.dicts:
            lst += dct.values()
        return lst


class CachedFastParser(type):
    """ This is a metaclass for caching `FastParser`. """
    def __call__(self, grammar, source, module_path=None):
        if not settings.fast_parser:
            return ParserWithRecovery(grammar, source, module_path)

        pi = parser_cache.get(module_path, None)
        if pi is None or isinstance(pi.parser, ParserWithRecovery):
            p = super(CachedFastParser, self).__call__(grammar, source, module_path)
        else:
            p = pi.parser  # pi is a `cache.ParserCacheItem`
            p.update(source)
        return p


class ParserNode(object):
    def __init__(self, fast_module, parser, source):
        self._fast_module = fast_module
        self.parent = None
        self._node_children = []

        self.source = source
        self.hash = hash(source)
        self.parser = parser
        if source:
            self._end_pos = parser.module.end_pos
        else:
            self._end_pos = 1, 0

        try:
            # With fast_parser we have either 1 subscope or only statements.
            self._content_scope = parser.module.subscopes[0]
            # A parsed node's content will be in the first indent, because
            # everything that's parsed is within this subscope.
            self._is_class_or_def = True
        except IndexError:
            self._content_scope = parser.module
            self._is_class_or_def = False
        else:
            self._rewrite_last_newline()

        # We need to be able to reset the original children of a parser.
        self._old_children = list(self._content_scope.children)

    def is_root_node(self):
        return self.parent is None

    def _rewrite_last_newline(self):
        """
        The ENDMARKER can contain a newline in the prefix. However this prefix
        really belongs to the function - respectively to the next function or
        parser node. If we don't rewrite that newline, we end up with a newline
        in the wrong position, i.d. at the end of the file instead of in the
        middle.
        """
        c = self._content_scope.children
        if tree.is_node(c[-1], 'suite'):  # In a simple_stmt there's no DEDENT.
            end_marker = self.parser.module.children[-1]
            # Set the DEDENT prefix instead of the ENDMARKER.
            c[-1].children[-1].prefix = end_marker.prefix
            end_marker.prefix = ''

    def __repr__(self):
        module = self.parser.module
        try:
            return '<%s: %s-%s>' % (type(self).__name__, module.start_pos, module.end_pos)
        except IndexError:
            # There's no module yet.
            return '<%s: empty>' % type(self).__name__

    @property
    def end_pos(self):
        return self._end_pos[0] + self.parser.position_modifier.line, self._end_pos[1]

    def reset_node(self):
        """
        Removes changes that were applied in this class.
        """
        self._node_children = []
        scope = self._content_scope
        scope.children = list(self._old_children)
        try:
            # This works if it's a MergedNamesDict.
            # We are correcting it, because the MergedNamesDicts are artificial
            # and can change after closing a node.
            scope.names_dict = scope.names_dict.dicts[0]
        except AttributeError:
            pass

    def close(self):
        """
        Closes the current parser node. This means that after this no further
        nodes should be added anymore.
        """
        # We only need to replace the dict if multiple dictionaries are used:
        if self._node_children:
            dcts = [n.parser.module.names_dict for n in self._node_children]
            # Need to insert the own node as well.
            dcts.insert(0, self._content_scope.names_dict)
            self._content_scope.names_dict = MergedNamesDict(dcts)
            endmarker = self.parser.get_parsed_node().children[-1]
            assert endmarker.type == 'endmarker'
            last_parser = self._node_children[-1].parser
            endmarker.start_pos = last_parser.get_parsed_node().end_pos

    @property
    def _indent(self):
        if self.is_root_node():
            return 0

        return self.parser.module.children[0].start_pos[1]

    def add_node(self, node, start_line, indent):
        """
        Adding a node means adding a node that was either just parsed or one
        that can be reused.
        """
        # Content that is not a subscope can never be part of the current node,
        # because it's basically a sister node, that sits next to it and not
        # within it.
        if (self._indent >= indent or not self._is_class_or_def) and \
                not self.is_root_node():
            self.close()
            return self.parent.add_node(node, start_line, indent)

        # Changing the line offsets is very important, because if they don't
        # fit, all the start_pos values will be wrong.
        m = node.parser.module
        node.parser.position_modifier.line = start_line - 1
        self._fast_module.modules.append(m)
        node.parent = self

        self._node_children.append(node)

        # Insert parser objects into current structure. We only need to set the
        # parents and children in a good way.
        scope = self._content_scope
        for child in m.children:
            child.parent = scope
            scope.children.append(child)

        return node

    def all_sub_nodes(self):
        """
        Returns all nodes including nested ones.
        """
        for n in self._node_children:
            yield n
            for y in n.all_sub_nodes():
                yield y

    @underscore_memoization  # Should only happen once!
    def remove_last_newline(self):
        self.parser.remove_last_newline()


class FastParser(use_metaclass(CachedFastParser)):
    _FLOWS_NEED_SPACE = 'if', 'elif', 'while', 'with', 'except', 'for'
    _FLOWS_NEED_COLON = 'else', 'try', 'except', 'finally'
    _keyword_re = re.compile('^[ \t]*(def |class |@|(?:%s)|(?:%s)\s*:)'
                             % ('|'.join(_FLOWS_NEED_SPACE),
                                '|'.join(_FLOWS_NEED_COLON)))

    def __init__(self, grammar, source, module_path=None):
        # set values like `tree.Module`.
        self._grammar = grammar
        self.module_path = module_path
        self._reset_caches()
        self.update(source)

    def _reset_caches(self):
        self.module = FastModule(self.module_path)
        self.root_node = self.current_node = ParserNode(self.module, self, '')

    def get_parsed_node(self):
        return self.module

    def update(self, source):
        # Variables for testing purposes: It is important that the number of
        # parsers used can be minimized. With these variables we can test
        # against that.
        self.number_parsers_used = 0
        self.number_of_splits = 0
        self.number_of_misses = 0
        self.module.reset_caches()
        self.source = source
        try:
            self._parse(source)
        except:
            # FastParser is cached, be careful with exceptions.
            self._reset_caches()
            raise

    def _split_parts(self, source):
        """
        Split the source code into different parts. This makes it possible to
        parse each part seperately and therefore cache parts of the file and
        not everything.
        """
        def gen_part():
            text = ''.join(current_lines)
            del current_lines[:]
            self.number_of_splits += 1
            return text

        def just_newlines(current_lines):
            for line in current_lines:
                line = line.lstrip('\t \n\r')
                if line and line[0] != '#':
                    return False
            return True

        # Split only new lines. Distinction between \r\n is the tokenizer's
        # job.
        # It seems like there's no problem with form feed characters here,
        # because we're not counting lines.
        self._lines = source.splitlines(True)
        current_lines = []
        is_decorator = False
        # Use -1, because that indent is always smaller than any other.
        indent_list = [-1, 0]
        new_indent = False
        parentheses_level = 0
        flow_indent = None
        previous_line = None
        # All things within flows are simply being ignored.
        for i, l in enumerate(self._lines):
            # Handle backslash newline escaping.
            if l.endswith('\\\n') or l.endswith('\\\r\n'):
                if previous_line is not None:
                    previous_line += l
                else:
                    previous_line = l
                continue
            if previous_line is not None:
                l = previous_line + l
                previous_line = None

            # check for dedents
            s = l.lstrip('\t \n\r')
            indent = len(l) - len(s)
            if not s or s[0] == '#':
                current_lines.append(l)  # Just ignore comments and blank lines
                continue

            if new_indent and not parentheses_level:
                if indent > indent_list[-2]:
                    # Set the actual indent, not just the random old indent + 1.
                    indent_list[-1] = indent
                new_indent = False

            while indent < indent_list[-1]:  # -> dedent
                indent_list.pop()
                # This automatically resets the flow_indent if there was a
                # dedent or a flow just on one line (with one simple_stmt).
                new_indent = False
                if flow_indent is None and current_lines and not parentheses_level:
                    yield gen_part()
                flow_indent = None

            # Check lines for functions/classes and split the code there.
            if flow_indent is None:
                m = self._keyword_re.match(l)
                if m:
                    # Strip whitespace and colon from flows as a check.
                    if m.group(1).strip(' \t\r\n:') in FLOWS:
                        if not parentheses_level:
                            flow_indent = indent
                    else:
                        if not is_decorator and not just_newlines(current_lines):
                            yield gen_part()
                    is_decorator = '@' == m.group(1)
                    if not is_decorator:
                        parentheses_level = 0
                        # The new indent needs to be higher
                        indent_list.append(indent + 1)
                        new_indent = True
                elif is_decorator:
                    is_decorator = False

            parentheses_level = \
                max(0, (l.count('(') + l.count('[') + l.count('{') -
                        l.count(')') - l.count(']') - l.count('}')))

            current_lines.append(l)

        if previous_line is not None:
            current_lines.append(previous_line)
        if current_lines:
            yield gen_part()

    def _parse(self, source):
        """ :type source: str """
        added_newline = False
        if not source or source[-1] != '\n':
            # To be compatible with Pythons grammar, we need a newline at the
            # end. The parser would handle it, but since the fast parser abuses
            # the normal parser in various ways, we need to care for this
            # ourselves.
            source += '\n'
            added_newline = True

        next_code_part_end_line = code_part_end_line = 1
        start = 0
        nodes = list(self.root_node.all_sub_nodes())
        # Now we can reset the node, because we have all the old nodes.
        self.root_node.reset_node()
        self.current_node = self.root_node
        last_end_line = 1

        for code_part in self._split_parts(source):
            next_code_part_end_line += code_part.count('\n')
            # If the last code part parsed isn't equal to the current end_pos,
            # we know that the parser went further (`def` start in a
            # docstring). So just parse the next part.
            if code_part_end_line == last_end_line:
                self._parse_part(code_part, source[start:], code_part_end_line, nodes)
            else:
                self.number_of_misses += 1
                # Means that some lines where not fully parsed. Parse it now.
                # This is a very rare case. Should only happens with very
                # strange code bits.
                while last_end_line < next_code_part_end_line:
                    code_part_end_line = last_end_line
                    # We could calculate the src in a more complicated way to
                    # make caching here possible as well. However, this is
                    # complicated and error-prone. Since this is not very often
                    # called - just ignore it.
                    src = ''.join(self._lines[code_part_end_line - 1:])
                    self._parse_part(code_part, src, code_part_end_line, nodes)
                    last_end_line = self.current_node.end_pos[0]
                debug.dbg("While parsing %s, starting with line %s wasn't included in split.",
                          self.module_path, code_part_end_line)
                #assert code_part_end_line > last_end_line
                # This means that the parser parsed faster than the last given
                # `code_part`.
                debug.dbg('While parsing %s, line %s slowed down the fast parser.',
                          self.module_path, code_part_end_line)

            code_part_end_line = next_code_part_end_line
            start += len(code_part)

            last_end_line = self.current_node.end_pos[0]

        if added_newline:
            self.current_node.remove_last_newline()

        # Now that the for loop is finished, we still want to close all nodes.
        node = self.current_node
        while node is not None:
            node.close()
            node = node.parent

        debug.dbg('Parsed %s, with %s parsers in %s splits.'
                  % (self.module_path, self.number_parsers_used,
                     self.number_of_splits))

    def _parse_part(self, source, parser_code, code_part_end_line, nodes):
        """
        Side effect: Alters the list of nodes.
        """
        h = hash(source)
        for index, node in enumerate(nodes):
            if node.hash == h and node.source == source:
                node.reset_node()
                nodes.remove(node)
                parser_code = source
                break
        else:
            tokenizer = FastTokenizer(parser_code)
            self.number_parsers_used += 1
            p = ParserWithRecovery(self._grammar, parser_code, self.module_path, tokenizer=tokenizer)

            end = code_part_end_line - 1 + p.module.end_pos[0]
            used_lines = self._lines[code_part_end_line - 1:end - 1]
            code_part_actually_used = ''.join(used_lines)

            node = ParserNode(self.module, p, code_part_actually_used)

        indent = len(parser_code) - len(parser_code.lstrip('\t '))

        self.current_node.add_node(node, code_part_end_line, indent)
        self.current_node = node


class FastTokenizer(object):
    """
    Breaks when certain conditions are met, i.e. a new function or class opens.
    """
    def __init__(self, source):
        self.source = source
        self._gen = source_tokens(source, use_exact_op_types=True)
        self._closed = False

        # fast parser options
        self.current = self.previous = NEWLINE, '', (0, 0)
        self._in_flow = False
        self._is_decorator = False
        self._first_stmt = True
        self._parentheses_level = 0
        self._indent_counter = 0
        self._flow_indent_counter = 0
        self._returned_endmarker = False
        self._expect_indent = False

    def __iter__(self):
        return self

    def next(self):
        """ Python 2 Compatibility """
        return self.__next__()

    def __next__(self):
        if self._closed:
            return self._finish_dedents()

        typ, value, start_pos, prefix = current = next(self._gen)
        if typ == ENDMARKER:
            self._closed = True
            self._returned_endmarker = True
            return current

        self.previous = self.current
        self.current = current

        if typ == INDENT:
            self._indent_counter += 1
            if not self._expect_indent and not self._first_stmt and not self._in_flow:
                # This does not mean that there is an actual flow, it means
                # that the INDENT is syntactically wrong.
                self._flow_indent_counter = self._indent_counter - 1
                self._in_flow = True
            self._expect_indent = False
        elif typ == DEDENT:
            self._indent_counter -= 1
            if self._in_flow:
                if self._indent_counter == self._flow_indent_counter:
                    self._in_flow = False
            else:
                self._closed = True
            return current

        previous_type = self.previous[0]
        if value in ('def', 'class') and self._parentheses_level:
            # Account for the fact that an open parentheses before a function
            # will reset the parentheses counter, but new lines before will
            # still be ignored. So check the prefix.

            # TODO what about flow parentheses counter resets in the tokenizer?
            self._parentheses_level = 0
            # We need to simulate a newline before the indent, because the
            # open parentheses ignored them.
            if re.search('\n\s*', prefix):
                previous_type = NEWLINE

        # Parentheses ignore the indentation rules. The other three stand for
        # new lines.
        if previous_type in (NEWLINE, INDENT, DEDENT) \
                and not self._parentheses_level and typ not in (INDENT, DEDENT):
            if not self._in_flow:
                if value in FLOWS:
                    self._flow_indent_counter = self._indent_counter
                    self._first_stmt = False
                elif value in ('def', 'class', '@'):
                    # The values here are exactly the same check as in
                    # _split_parts, but this time with tokenize and therefore
                    # precise.
                    if not self._first_stmt and not self._is_decorator:
                        return self._close()

                    self._is_decorator = '@' == value
                    if not self._is_decorator:
                        self._first_stmt = False
                        self._expect_indent = True
                elif self._expect_indent:
                    return self._close()
                else:
                    self._first_stmt = False

        if value in '([{' and value:
            self._parentheses_level += 1
        elif value in ')]}' and value:
            # Ignore closing parentheses, because they are all
            # irrelevant for the indentation.
            self._parentheses_level = max(self._parentheses_level - 1, 0)
        return current

    def _close(self):
        if self._first_stmt:
            # Continue like nothing has happened, because we want to enter
            # the first class/function.
            if self.current[1] != '@':
                self._first_stmt = False
            return self.current
        else:
            self._closed = True
            return self._finish_dedents()

    def _finish_dedents(self):
        if self._indent_counter:
            self._indent_counter -= 1
            return DEDENT, '', self.current[2], ''
        elif not self._returned_endmarker:
            self._returned_endmarker = True
            return ENDMARKER, '', self.current[2], self._get_prefix()
        else:
            raise StopIteration

    def _get_prefix(self):
        """
        We're using the current prefix for the endmarker to not loose any
        information. However we care about "lost" lines. The prefix of the
        current line (indent) will always be included in the current line.
        """
        cur = self.current
        while cur[0] == DEDENT:
            cur = next(self._gen)
        prefix = cur[3]

        # \Z for the end of the string. $ is bugged, because it has the
        # same behavior with or without re.MULTILINE.
        return re.sub(r'[^\n]+\Z', '', prefix)