# type: ignore
"""Contains a class that compiles and stores all the information that
is relevant to the analysis of SyntaxErrors.
"""

from .. import debug_helper, token_utils
from ..source_cache import cache
from .syntax_utils import matching_brackets

# During the analysis for finding the cause of the error, we typically examine
# a "bad token" identified by Python as the cause of the error and often
# look at its two neighbours. If the bad token is the first one in a statement
# it does not have a token preceding it; if it is the last one, it does not
# have a token following it. By assigning a meaningless token value to these
# neighbours, the code for the analysis can be greatly simplified as we do
# not have to verify the existence of these neighbours.
MEANINGLESS_TOKEN = token_utils.tokenize(" ")[0]
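# (The exact content of this token does not matter; what matters is that
# it will not be mistaken for any meaningful token during the analysis.)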
# fmt: off
LINE_NUMBER        = "       {:%d}| "  # noqa
MARKED_LINE_NUMBER = "    -->{:%d}| "  # noqa
LINE_GAP           = "       (...)"  # noqa
# fmt: on
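# Example of the two-step formatting above, with illustrative values:
# (LINE_NUMBER % 2).format(9) yields "        9| " and
# (MARKED_LINE_NUMBER % 2).format(9) yields "    --> 9| ".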


class Statement:
    """Instances of this class contain all relevant information required
    for the various functions that attempt to determine the cause of
    SyntaxError.

    One main idea is to retrieve a "complete statement" where the
    exception is raised. By "complete statement", we mean the smallest
    number of consecutive lines of code that contains the line where
    the exception was raised and includes matching pairs of brackets,
    (), [], {}. If a problem arises due to non-matching brackets,
    this information is available (see begin_brackets and end_bracket).

    A complete statement is saved as a list of tokens (self.statement_tokens)
    from which it could be reconstructed using an untokenize function.

    From this list of tokens, a secondary list (self.tokens) is obtained
    by removing all space-like and comment tokens; this secondary list
    thus includes only the meaningful tokens needed for the analysis.

    To simplify the code doing the error analysis itself, we precompute
    various parameters (e.g. does the statement fit on a single line,
    how many meaningful tokens are included, what the first and last
    tokens of that statement are, etc.) that are needed by some functions.
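
    As an illustration (a sketch, not an exact trace): given the two
    physical lines

        a = [1,
             2

    the complete statement spans both lines; statement_tokens contains
    every token (including newline-like tokens), tokens reduces to the
    meaningful tokens a, =, [, 1, the comma, and 2, and
    statement_brackets still holds the unclosed '['.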
    """

    def __init__(self, value, bad_line, original_bad_line):
        # The basic information given by a SyntaxError
        self.filename = value.filename
        self.linenumber = value.lineno
        self.message = value.msg
        self.offset = value.offset
        self.value = value  # can be useful when debugging with the repl

        # Python 3.10 introduced new attributes for 'value'.
        # We have already taken care of assigning default values
        # to these in core.py.
        self.end_offset = value.end_offset
        self.end_linenumber = value.end_lineno

        if self.end_offset is None or self.offset is None:
            self.highlighted_tokens = None
        elif self.end_offset - self.offset == 1:
            self.highlighted_tokens = None
        else:
            self.highlighted_tokens = []

        # From the traceback, we were previously able to obtain the line
        # of code identified by Python as being problematic.
        self.bad_line = bad_line  # previously obtained from the traceback
        self.original_bad_line = original_bad_line
        # skipcq: PTC-W0052
        self.entire_statement = original_bad_line  # temporary assignment

        # The following will be obtained using offset and bad_line
        self.bad_token = None
        self.bad_token_index = 0
        self.prev_token = None  # meaningful token preceding bad token
        self.next_token = None  # meaningful token following bad token
        self.bad_token_comment = None

        # SyntaxErrors raised inside f-strings occasionally require special treatment
        self.fstring_error = self.filename == "<fstring>" or "f-string" in self.message

        # Useful to determine which lines to include in the contextual display.
        self.all_statements = []

        # statement_tokens includes all tokens (newlines, comments, etc.)
        # needed for proper reconstruction of multiline statements.
        self.statement_tokens = []
        self.tokens = []  # meaningful tokens, used for error analysis; see docstring
        self.nb_tokens = 0  # number of meaningful tokens
        self.formatted_partial_source = ""
        self.source_lines = []  # lines of code for the source

        self.statement_brackets = []  # keep track of ([{ anywhere in a statement
        self.begin_brackets = []  # unclosed ([{  before bad token
        self.end_bracket = None  # single unmatched )]}

        self.first_token = None  # meaningful token
        self.last_token = None  # meaningful token

        # The following is used to indicate the position of ^ and other
        # symbols when using where()
        self.location_markers = {}

        # When using the friendly console (repl), a SyntaxError might be
        # raised before all the brackets needed to complete a statement
        # have been closed. Knowing this can be useful during the analysis.
        self.using_friendly_console = False
        if self.filename is not None:
            self.using_friendly_console = self.filename.startswith("<friendly")
        elif not self.exceptional_case():  # pragma: no cover
            debug_helper.log("filename is None in source_info.Statement")
            debug_helper.log("Add this as a new test.")

        self.get_token_info()

    def exceptional_case(self):
        """Returns True if a case that is known to not have all the required information
        available, such as filename or linenumber."""
        return (
            "too many statically nested blocks" in self.message
            or "encoding problem" in self.message
        )

    def get_token_info(self):
        """Obtain all the relevant information about the tokens in
        the file, with particular emphasis on the tokens belonging
        to the statement where the error is located.
        """
        if self.linenumber is not None:
            source_tokens = self.get_source_tokens()
            # self.all_statements and self.statement_tokens are set in the following
            self.obtain_statement(source_tokens)
            self.tokens = self.remove_meaningless_tokens()
            current_statement_index = len(self.all_statements) - 1
            if not self.tokens:
                while current_statement_index > 0:
                    self.statement_tokens = self.all_statements[current_statement_index]
                    current_statement_index -= 1
                    self.tokens = self.remove_meaningless_tokens()
                    if self.tokens:
                        break
                else:
                    self.tokens = [token_utils.tokenize("Internal_error")[0]]

            self.entire_statement = token_utils.untokenize(self.statement_tokens)
            if (
                self.filename.startswith("<friendly-console")
                and self.statement_brackets
                and not self.end_bracket
            ):  # pragma: no cover
                # We got an error flagged before we had the chance to close
                # brackets. Unclosed brackets should not be a problem on their
                # own in a console session - so, we make sure to close
                # the brackets in order to be able to find the true cause
                # of the error
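                # For example (illustration only): if statement_brackets
                # were ["(", "["], the loop below would build add_token == "])".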
                add_token = ""
                brackets = self.statement_brackets.copy()
                while brackets:
                    bracket = brackets.pop()
                    if bracket == "(":
                        add_token += ")"
                    elif bracket == "[":
                        add_token += "]"
                    else:
                        add_token += "}"

                if self.tokens[0].string in [
                    "class",
                    "def",
                    "if",
                    "elif",
                    "while",
                    "for",
                    "except",
                    "with",
                ]:
                    add_token += ":"

                last_token = self.tokens[-1]
                last_token = last_token.copy()
                last_token.start_row += 1
                last_token.string = last_token.line = add_token
                self.tokens.append(last_token)

        elif not self.exceptional_case():  # pragma: no cover
            debug_helper.log("linenumber is None in source_info.Statement")
            debug_helper.log("Add this as new test case")

        if self.tokens:
            self.assign_individual_token_values()
            if self.tokens[-1].is_unclosed_string():
                self.bad_line = self.entire_statement
        elif not self.exceptional_case():  # pragma: no cover
            debug_helper.log("No meaningful tokens in source_info.Statement")

    def get_source_tokens(self):
        """Returns a list containing all the tokens from the source."""
        source = ""
        if "f-string: invalid syntax" in self.message:
            source = self.bad_line
            try:
                # Compile, rather than exec, so that the line is never
                # actually run; we only want the SyntaxError information.
                compile(self.bad_line, "<fstring>", "exec")
            except SyntaxError as e:
                self.offset = e.offset
                self.linenumber = 1
        if not source.strip():
            self.source_lines = cache.get_source_lines(self.filename)
            if self.source_lines == ["\n"]:
                self.source_lines = ["...\n"] * (self.linenumber - 1)
                self.source_lines.append(self.bad_line)
            source = "".join(self.source_lines)
            if not source.strip():
                source = self.bad_line or "\n"
        return token_utils.tokenize(source)

    def assign_individual_token_values(self):
        """Assign values of previous and next to bad token and other
        related values.
        """
        self.nb_tokens = len(self.tokens)
        if self.nb_tokens >= 1:
            self.first_token = self.tokens[0]
            self.last_token = self.tokens[-1]

            if self.bad_token is None:
                self.bad_token = self.last_token
                self.bad_token_index = self.nb_tokens - 1
                if self.bad_token_index == 0:
                    self.prev_token = MEANINGLESS_TOKEN
                else:
                    self.prev_token = self.tokens[self.bad_token_index - 1]

            if self.bad_token_index == 0:
                self.prev_token = MEANINGLESS_TOKEN
            elif self.prev_token is None:  # pragma: no cover
                debug_helper.log("This case should be added as new test.")
                self.prev_token = self.tokens[self.bad_token_index - 1]

        if self.last_token != self.bad_token:
            self.next_token = self.tokens[self.bad_token_index + 1]
        else:
            self.next_token = MEANINGLESS_TOKEN

    def format_statement(self):
        """Format the statement identified as causing the problem and possibly
        a couple of preceding statements, showing the line number and token identified.
        """
        # Some errors, like "Too many statistically nested blocs" prevent
        # Python from making a source available.
        if not (self.statement_tokens or self.original_bad_line.strip("\n")):
            self.formatted_partial_source = ""
            return

        # In some IndentationError cases, and possibly others,
        # Python shows the ^ just before the first token.
        # Note that start_col from tokenize (0-based) and self.offset
        # (1-based) differ in their origins.
        offset_col = self.tokens[-1].start_col
        if self.offset == offset_col:
            self.offset += 1
            self.end_offset += 1

        if self.statement_tokens:
            lines = self.get_lines_to_show()
        else:
            lines = [(self.linenumber, self.original_bad_line.strip("\n"))]
        self.annotate_lines(lines)

    def get_lines_to_show(self):
        """Restricts the lines of code to be included when showing the location
        of the error.
        """
        self.create_location_markers()
        first_token = self.all_statements[0][0]
        lines = get_lines_from_statement(
            self.statement_tokens, self.linenumber, first_token
        )
        if lines:
            last_linenumber_included, _ = lines[-1]
            if self.linenumber > last_linenumber_included:
                # The problem line was an empty line
                lines.append((last_linenumber_included + 1, ""))
        return lines

    def annotate_lines(self, lines):
        """Adds the caret marks used to show the location of the error"""
        # Lines is a list of tuples, each tuple has the line number
        # as a first item, and the text of the line as the second.

        if lines:
            # Ensures that we align all the lines vertically on the vertical
            # bar following the line number, even if the number of digits
            # changes, e.g. when we include lines 8, 9, 10, 11.
            nb_digits = len(str(lines[-1][0]))
        else:
            debug_helper.log("No line to annotate in annotate_lines")
            self.formatted_partial_source = "\n"
            return
        no_mark = LINE_NUMBER % nb_digits
        with_mark = MARKED_LINE_NUMBER % nb_digits if len(lines) > 1 else no_mark
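        # Note: len(LINE_NUMBER % n) == 12 + len(str(n)), so the "- 3" below
        # makes leading_spaces exactly as long as the formatted line number
        # prefix (9 + nb_digits characters).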
        leading_spaces = " " * (len(LINE_NUMBER % lines[-1][0]) - 3)

        new_lines = []
        for i, line in lines:
            if i in self.location_markers:
                num = with_mark.format(i)
                new_lines.append(num + line)
                new_lines.append(leading_spaces + self.location_markers[i])
            else:
                num = no_mark.format(i)
                new_lines.append(num + line)

        self.formatted_partial_source = "\n".join(new_lines)

    def create_location_markers(self):
        # In some cases, the location markers are determined while analysing
        # the statement to find the cause.
        if self.location_markers:
            return
        # We generally go with the information obtained by Python,
        # although sometimes it might be off by one; we realign the offset
        # with the start of the bad token.
        self.offset = self.bad_token.start_col
        nb_carets = 0
        if self.highlighted_tokens:
            last_token = self.highlighted_tokens[-1]
            if last_token.is_comment() and last_token is not self.bad_token:
                last_token = self.highlighted_tokens[-2]
            if self.end_offset is not None:
                if self.end_offset < last_token.end_col:
                    nb_carets = self.end_offset - self.bad_token.start_col
                else:
                    nb_carets = last_token.end_col - self.bad_token.start_col
        # Highlight at least the entire bad token for friendly
        nb_carets = max(len(self.bad_token.string), nb_carets)
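        # For illustration: with offset == 4 and nb_carets == 2,
        # offset_mark would be "    ^^".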
        offset_mark = " " * self.offset + "^" * nb_carets
        self.location_markers = {self.linenumber: offset_mark}

    def obtain_statement(self, source_tokens):
        """This method scans the source searching for the statement that
        caused the problem. Most often, it will be a single line of code.
        However, it might occasionally be a multiline statement that
        includes code surrounded by some brackets spanning multiple lines.

        It will set the following:

        - self.statement_tokens: a list of all the tokens in the problem statement
        - self.bad_token: the token identified as causing the problem based on offset.
        - self.statement_brackets: a list of open brackets '([{' not yet closed
        - self.begin_brackets: a list of open brackets '([{' in a statement
          before the bad token.
        - self.end_bracket: an unmatched closing bracket ')]}' signaling an error
        - self.all_statements: list of all individual identified statements up to
          and including the problem statement.
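
        As an illustration: if the error were flagged just after the
        incomplete line "d = {1: (2," then statement_brackets would end
        up as ['{', '('], both brackets being still unclosed.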
        """

        previous_row = -1
        previous_token = None
        continuation_line = False
        # Some tokens cannot occur within brackets; if they are indicated as being
        # the offending token, it might be because we have an unclosed bracket.
        should_begin_statement = [
            "assert",
            "async",
            "await",
            "break",
            "class",
            "continue",
            "def",
            "del",
            "elif",
            "except",
            "finally",
            "global",
            "import",  # cannot write: from (x import ...
            "nonlocal",
            "pass",
            "raise",
            "return",
            "try",
            "with",
            "while",
            "yield",
        ]

        last_line_to_include = self.linenumber
        for token in source_tokens:
            if (
                token.start_row > last_line_to_include
                and not continuation_line
                and not self.statement_brackets
            ):
                break

            # is this a new statement?
            # Valid statements will have matching brackets (), {}, [].
            # A new statement will typically start on a new line and will be
            # preceded by valid statements.

            # An initial version was based on the assumption that any semicolon
            # would be used correctly and would indicate the end of a statement;
            # however, I am guessing that it more likely indicates
            # a typo, and that the user wanted to write a comma or a colon, so I
            # do not treat them in any special way.
            if token.is_unclosed_string():
                token.string = token.string.rstrip()
                self.statement_tokens.append(token)
                self.linenumber = token.start_row
                break
            if token.start_row > previous_row:
                if previous_token is not None:
                    continuation_line = previous_token.line.endswith("\\\n")
                if (
                    token.start_row <= last_line_to_include
                    and not self.statement_brackets
                ):
                    if self.statement_tokens:
                        self.all_statements.append(self.statement_tokens[:])
                    self.statement_tokens = []
                    self.begin_brackets = []
                if token.start_row > last_line_to_include and self.statement_brackets:
                    # The statement continues beyond the line with the error, so
                    # we make sure to include this additional line.
                    last_line_to_include = token.start_row
                previous_row = token.start_row

            self.statement_tokens.append(token)
            # The offset seems to be different depending on Python versions,
            # sometimes matching the beginning of a token, sometimes the end.
            # Furthermore, the end of a token (end_col) might be equal to
            # the beginning of the next (start_col).
            # Additionally, for Python 3.10, multiple tokens can be highlighted
            # if they are on the same line.
            if self.highlighted_tokens:  # not None and not empty list
                if (
                    self.linenumber == token.start_row
                    and self.end_offset is not None
                    and (
                        self.offset < token.end_col < self.end_offset
                        or self.end_offset == 0
                    )
                    and token.string.strip()
                ):
                    self.highlighted_tokens.append(token)
            elif (
                token.start_row == last_line_to_include
                and token.start_col <= self.offset <= token.end_col
                and self.bad_token is None
                and token.string.strip()
            ):
                self.bad_token = token
                if self.bad_token.is_comment():
                    self.bad_token_comment = self.bad_token
                    self.bad_token = self.prev_token
                if self.highlighted_tokens is not None:
                    self.highlighted_tokens.append(self.bad_token)
            elif (
                token.string.strip()
                and not token.is_comment()
                and self.bad_token is None
            ):
                self.prev_token = token

            previous_token = token

            if (
                self.bad_token in should_begin_statement
                and self.bad_token != self.statement_tokens[0]
                and self.statement_brackets
            ):
                break  # we almost certainly have an unclosed bracket
            # Note: '' in 'any string' evaluates to True, so we must be
            # careful to not accidentally treat empty strings as brackets.
            if not token.string or token.string not in "()[]}{":
                continue

            if token.string in "([{":
                self.statement_brackets.append(token.string)
                if self.bad_token is None or self.bad_token is token:
                    self.begin_brackets.append(token)
            elif token.string in ")]}":
                self.end_bracket = token
                if not self.statement_brackets:
                    break

                open_bracket = self.statement_brackets.pop()
                if not matching_brackets(open_bracket, token.string):
                    self.statement_brackets.append(open_bracket)
                    break
                if self.begin_brackets and self.bad_token is None:
                    self.begin_brackets.pop()
                self.end_bracket = None

        if self.statement_tokens:  # Protecting against EOF while parsing
            last_line = token_utils.untokenize(self.statement_tokens)
            if last_line.strip():
                self.all_statements.append(self.statement_tokens)
            elif self.all_statements:
                self.statement_tokens = self.all_statements[-1]

    def remove_meaningless_tokens(self):
        """Given a list of tokens, remove all space-like tokens and comments;
        also assign the index value of the bad token.
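
        For example, the statement "a = 1  # comment" reduces to the
        three meaningful tokens: a, = and 1.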
        """
        index = 0
        tokens = []
        for tok in self.statement_tokens:
            if not tok.string.strip() or tok.is_comment():
                continue
            tokens.append(tok)
            if tok is self.bad_token:
                self.bad_token_index = index
            index += 1
        return tokens


def get_lines_from_statement(tokens, linenumber, prev_token):
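    """Selects the source lines to show for a statement, keeping only
    lines close to the one where the error was found and splitting
    multiline tokens (such as docstrings) into individual lines.
    """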
    lines = []
    end_docstring = False
    for token in tokens:
        current_linenumber = token.start_row
        current_line = token.line.rstrip()
        if lines and prev_token.end_row == current_linenumber:
            continue
        if linenumber < current_linenumber and token == tokens[0]:
            break
        if linenumber - current_linenumber < 5:
            if end_docstring == current_line:
                end_docstring = False
                continue
            if "\n" in current_line:
                text = current_line.split("\n")  # handle docstring
                for line in text:
                    lines.append((current_linenumber, line))
                    current_linenumber += 1
                end_docstring = text[-1]
                current_linenumber -= 1
            else:
                lines.append((current_linenumber, current_line))
        else:
            if "\n" in current_line:
                text = current_line.split("\n")
                end_docstring = text[-1]
        prev_token = token
    return lines
