# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2 now.
# A fork of `parso.parser`.
# https://github.com/davidhalter/parso/blob/v0.3.4/parso/parser.py
#
# The following changes were made:
# - Typing was added.
# - Error recovery was removed.
# - The Jedi-specific _allowed_transition_names_and_token_types API was removed.
# - Error messages were improved by using our exceptions module.
# - node_map/leaf_map were removed in favor of just calling convert_*.
# - convert_node/convert_leaf were renamed to convert_nonterminal/convert_terminal.
# - convert_nonterminal is called regardless of the number of children. Parso avoids
#   calling it in some cases to avoid creating extra nodes.
# - The parser is constructed with the tokens, which allows us to track a bit more
#   state. As a consequence, each parser may only be used once.
# - Supports our custom Token class, instead of `parso.python.tokenize.Token`.

from dataclasses import dataclass, field
from typing import Generic, Iterable, List, Sequence, TypeVar, Union

from libcst._exceptions import (
    EOFSentinel,
    get_expected_str,
    ParserSyntaxError,
    PartialParserSyntaxError,
)
from libcst._parser.parso.pgen2.generator import DFAState, Grammar, ReservedString
from libcst._parser.parso.python.token import TokenType
from libcst._parser.types.token import Token

_NodeT = TypeVar("_NodeT")
_TokenTypeT = TypeVar("_TokenTypeT", bound=TokenType)
_TokenT = TypeVar("_TokenT", bound=Token)


@dataclass(frozen=False)
class StackNode(Generic[_TokenTypeT, _NodeT]):
    dfa: "DFAState[_TokenTypeT]"
    nodes: List[_NodeT] = field(default_factory=list)

    @property
    def nonterminal(self) -> str:
        return self.dfa.from_rule
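

# Each entry on the parser stack pairs the DFA state the parser is currently in
# with the children collected so far for that nonterminal. When the DFA reaches
# a final state, `_pop()` reduces those children into a single node via
# `convert_nonterminal`.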
def _token_to_transition(
    grammar: "Grammar[_TokenTypeT]", type_: _TokenTypeT, value: str
) -> Union[ReservedString, _TokenTypeT]:
    # Map from token to label
    if type_.contains_syntax:
        # Check for reserved words (keywords)
        try:
            return grammar.reserved_syntax_strings[value]
        except KeyError:
            pass
    return type_
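

# For example (illustrative values, assuming a Python-like grammar): a NAME
# token whose string is "if" has `contains_syntax` set, and "if" is a key in
# `reserved_syntax_strings`, so it maps to that ReservedString label; a NAME
# token whose string is "foo" falls through and maps to the NAME token type
# itself.

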
# TODO: This should be an ABC, but there's a metaclass conflict between Generic and ABC
# that's fixed in Python 3.7.
class BaseParser(Generic[_TokenT, _TokenTypeT, _NodeT]):
"""Parser engine.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See python/tokenize.py for how to get input tokens by a string.
"""
    tokens: Iterable[_TokenT]
    lines: Sequence[str]  # used when generating parse errors
    _pgen_grammar: "Grammar[_TokenTypeT]"
    stack: List[StackNode[_TokenTypeT, _NodeT]]

    # Keep track of whether parse was called. Because a parser may keep global
    # mutable state, each BaseParser instance should only be used once.
    __was_parse_called: bool

    def __init__(
        self,
        *,
        tokens: Iterable[_TokenT],
        lines: Sequence[str],
        pgen_grammar: "Grammar[_TokenTypeT]",
        start_nonterminal: str,
    ) -> None:
        self.tokens = tokens
        self.lines = lines
        self._pgen_grammar = pgen_grammar
        first_dfa = pgen_grammar.nonterminal_to_dfas[start_nonterminal][0]
        self.stack = [StackNode(first_dfa)]
        self.__was_parse_called = False

    def parse(self) -> _NodeT:
        # Ensure that we don't re-use parsers.
        if self.__was_parse_called:
            raise Exception("Each parser object may only be used to parse once.")
        self.__was_parse_called = True

        for token in self.tokens:
            self._add_token(token)
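
        # All tokens are consumed. Unwind the stack: each `_pop()` reduces one
        # finished nonterminal, until only the start symbol's node remains.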
        while True:
            tos = self.stack[-1]
            if not tos.dfa.is_final:
                expected_str = get_expected_str(
                    EOFSentinel.EOF, tos.dfa.transitions.keys()
                )
                raise ParserSyntaxError(
                    f"Incomplete input. {expected_str}",
                    lines=self.lines,
                    raw_line=len(self.lines),
                    raw_column=len(self.lines[-1]),
                )

            if len(self.stack) > 1:
                self._pop()
            else:
                return self.convert_nonterminal(tos.nonterminal, tos.nodes)

    def convert_nonterminal(
        self, nonterminal: str, children: Sequence[_NodeT]
    ) -> _NodeT:
        ...

    def convert_terminal(self, token: _TokenT) -> _NodeT:
        ...

    def _add_token(self, token: _TokenT) -> None:
        """
        This is the only core function for parsing; essentially everything
        happens here. The parser generator prepared everything ahead of time,
        so we only apply the necessary steps here.
        """
        grammar = self._pgen_grammar
        stack = self.stack
        # pyre-fixme[6]: Expected `_TokenTypeT` for 2nd param but got `TokenType`.
        transition = _token_to_transition(grammar, token.type, token.string)

        while True:
            try:
                plan = stack[-1].dfa.transitions[transition]
                break
            except KeyError:
                if stack[-1].dfa.is_final:
                    try:
                        self._pop()
                    except PartialParserSyntaxError as ex:
                        # Upconvert the PartialParserSyntaxError to a
                        # ParserSyntaxError by backfilling the line/column
                        # information.
                        raise ParserSyntaxError(
                            ex.message,
                            lines=self.lines,
                            raw_line=token.start_pos[0],
                            raw_column=token.start_pos[1],
                        )
                    except Exception as ex:
                        # convert_nonterminal may fail due to a bug in our code.
                        # Try to recover enough to at least tell us where in the
                        # file it failed.
                        raise ParserSyntaxError(
                            f"Internal error: {ex}",
                            lines=self.lines,
                            raw_line=token.start_pos[0],
                            raw_column=token.start_pos[1],
                        )
                else:
                    # We never broke out -- EOF is too soon -- unfinished statement.
                    #
                    # BUG: The `expected_str` may not be complete because we already
                    # popped the other possibilities off the stack at this point, but
                    # it still seems useful to list some of the possibilities that we
                    # could've expected.
                    expected_str = get_expected_str(
                        token, stack[-1].dfa.transitions.keys()
                    )
                    raise ParserSyntaxError(
                        f"Incomplete input. {expected_str}",
                        lines=self.lines,
                        raw_line=token.start_pos[0],
                        raw_column=token.start_pos[1],
                    )
            except IndexError:
                # I don't think this will ever happen with Python's grammar,
                # because if there are any extra tokens at the end of the input,
                # we'll instead complain that we expected ENDMARKER.
                #
                # However, let's leave it just in case.
                expected_str = get_expected_str(token, EOFSentinel.EOF)
                raise ParserSyntaxError(
                    f"Too much input. {expected_str}",
                    lines=self.lines,
                    raw_line=token.start_pos[0],
                    raw_column=token.start_pos[1],
                )

        # Logically, `plan` is always defined here, but pyre can't reasonably
        # determine that.
        stack[-1].dfa = plan.next_dfa
        for push in plan.dfa_pushes:
            stack.append(StackNode(push))

        leaf = self.convert_terminal(token)
        stack[-1].nodes.append(leaf)

    def _pop(self) -> None:
        tos = self.stack.pop()

        # Unlike parso and lib2to3, we call `convert_nonterminal` unconditionally
        # instead of only when we have more than one child. This allows us to
        # create a far more consistent and predictable tree.
        new_node = self.convert_nonterminal(tos.dfa.from_rule, tos.nodes)
        self.stack[-1].nodes.append(new_node)
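

# A sketch of a concrete subclass (hypothetical; the real CST parser elsewhere
# in this package is far more involved). It builds a simple tuple-based tree in
# which each nonterminal is a (name, children) pair and each leaf is a string:
#
#   class TupleParser(BaseParser[Token, TokenType, object]):
#       def convert_nonterminal(self, nonterminal, children):
#           return (nonterminal, tuple(children))
#
#       def convert_terminal(self, token):
#           return token.string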