# type: ignore
"""Contains a class that compiles and stores all the information that
is relevant to the analysis of SyntaxErrors.
"""
from .. import debug_helper, token_utils
from ..source_cache import cache
from .syntax_utils import matching_brackets
# During the analysis for finding the cause of the error, we typically examine
# a "bad token" identified by Python as the cause of the error and often
# look at its two neighbours. If the bad token is the first one in a statement,
# it does not have a token preceding it; if it is the last one, it does not
# have a token following it. By assigning a meaningless token value to these
# neighbours, the code for the analysis can be greatly simplified, as we do
# not have to verify their existence.
MEANINGLESS_TOKEN = token_utils.tokenize(" ")[0]
# fmt: off
LINE_NUMBER = " {:%d}| " # noqa
MARKED_LINE_NUMBER = " -->{:%d}| " # noqa
LINE_GAP = " (...)" # noqa
# fmt: on
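# For example, LINE_NUMBER % 2 yields " {:2}| ", and " {:2}| ".format(7)
# then produces "  7| ", so that line numbers line up on the vertical bar.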
class Statement:
"""Instances of this class contain all relevant information required
for the various functions that attempt to determine the cause of
    a SyntaxError.
One main idea is to retrieve a "complete statement" where the
exception is raised. By "complete statement", we mean the smallest
number of consecutive lines of code that contains the line where
the exception was raised and includes matching pairs of brackets,
(), [], {}. If a problem arises due to non-matching pairs of brackets,
this information is available (variables begin_brackets or end_bracket).
A complete statement is saved as a list of tokens (self.statement_tokens)
from which it could be reconstructed using an untokenize function.
From this list of tokens, a secondary list is obtained (self.tokens) by
    removing all space-like and comment tokens; this secondary list
    thus includes only the meaningful tokens needed for the analysis.
To simplify the code doing the error analysis itself, we precompute various
    parameters (e.g. does the statement fit on a single line, how many
meaningful tokens are included, what are the first and last tokens
on that statement, etc.) which are needed for some functions.
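
    As a purely illustrative example, given the source

        a = 1
        b = (2,
             3]

    the complete statement for an error reported on the last line spans
    the last two lines, since the bracket opened on the second line is
    only closed (here, mismatched) on the third.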
"""
def __init__(self, value, bad_line, original_bad_line):
# The basic information given by a SyntaxError
self.filename = value.filename
self.linenumber = value.lineno
self.message = value.msg
self.offset = value.offset
self.value = value # can be useful when debugging with the repl
# Python 3.10 introduced new attributes for 'value'.
        # We have already taken care of assigning some default values
# to these in core.py.
self.end_offset = value.end_offset
self.end_linenumber = value.end_lineno
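        # Python 3.10+ can highlight a range of tokens. A range of width one
        # (end_offset - offset == 1) points at a single location, in which
        # case there is nothing extra to highlight.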
if self.end_offset is None or self.offset is None:
self.highlighted_tokens = None
elif self.end_offset - self.offset == 1:
self.highlighted_tokens = None
else:
self.highlighted_tokens = []
        # From the traceback, we were previously able to obtain the line
        # of code identified by Python as being problematic.
self.bad_line = bad_line # previously obtained from the traceback
self.original_bad_line = original_bad_line
# skipcq: PTC-W0052
self.entire_statement = original_bad_line # temporary assignment
# The following will be obtained using offset and bad_line
self.bad_token = None
self.bad_token_index = 0
self.prev_token = None # meaningful token preceding bad token
self.next_token = None # meaningful token following bad token
self.bad_token_comment = None
        # SyntaxErrors produced inside f-strings occasionally require special treatment.
self.fstring_error = self.filename == "<fstring>" or "f-string" in self.message
self.all_statements = [] # useful to determine what lines to include in
# the contextual display
# statement_tokens includes all tokens, including newlines, etc., needed
self.statement_tokens = [] # for proper reconstruction of multiline statements
self.tokens = [] # meaningful tokens, used for error analysis; see docstring
self.nb_tokens = 0 # number of meaningful tokens
self.formatted_partial_source = ""
self.source_lines = [] # lines of code for the source
self.statement_brackets = [] # keep track of ([{ anywhere in a statement
self.begin_brackets = [] # unclosed ([{ before bad token
self.end_bracket = None # single unmatched )]}
self.first_token = None # meaningful token
self.last_token = None # meaningful token
# The following is used to indicate the position of ^ and other
# symbols when using where()
self.location_markers = {}
# When using the friendly console (repl), SyntaxError might prevent
# closing all brackets to complete a statement. Knowing this can be
# useful during the error analysis.
self.using_friendly_console = False
if self.filename is not None:
self.using_friendly_console = self.filename.startswith("<friendly")
elif not self.exceptional_case(): # pragma: no cover
debug_helper.log("filename is None in source_info.Statement")
debug_helper.log("Add this as a new test.")
self.get_token_info()
def exceptional_case(self):
"""Returns True if a case that is known to not have all the required information
available, such as filename or linenumber."""
return (
"too many statically nested blocks" in self.message
or "encoding problem" in self.message
)
def get_token_info(self):
"""Obtain all the relevant information about the tokens in
the file, with particular emphasis on the tokens belonging
to the statement where the error is located.
"""
if self.linenumber is not None:
source_tokens = self.get_source_tokens()
# self.all_statements and self.statement_tokens are set in the following
self.obtain_statement(source_tokens)
self.tokens = self.remove_meaningless_tokens()
current_statement_index = len(self.all_statements) - 1
if not self.tokens:
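                # The current statement has no meaningful tokens (e.g. it
                # contains only comments or whitespace); fall back on the
                # most recent statement that has some.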
while current_statement_index > 0:
self.statement_tokens = self.all_statements[current_statement_index]
current_statement_index -= 1
self.tokens = self.remove_meaningless_tokens()
if self.tokens:
break
else:
self.tokens = [token_utils.tokenize("Internal_error")[0]]
self.entire_statement = token_utils.untokenize(self.statement_tokens)
if (
self.filename.startswith("<friendly-console")
and self.statement_brackets
and not self.end_bracket
): # pragma: no cover
# We got an error flagged before we had the chance to close
# brackets. Unclosed brackets should not be a problem on their
# own in a console session - so, we make sure to close
# the brackets in order to be able to find the true cause
# of the error
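                # For example, an interrupted "d = {1: (2," needs ")}"
                # appended to close its brackets in the right order.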
add_token = ""
brackets = self.statement_brackets.copy()
while brackets:
bracket = brackets.pop()
if bracket == "(":
add_token += ")"
elif bracket == "[":
add_token += "]"
else:
add_token += "}"
if self.tokens[0].string in [
"class",
"def",
"if",
"elif",
"while",
"for",
"except",
"with",
]:
add_token += ":"
last_token = self.tokens[-1]
last_token = last_token.copy()
last_token.start_row += 1
last_token.string = last_token.line = add_token
self.tokens.append(last_token)
elif not self.exceptional_case(): # pragma: no cover
debug_helper.log("linenumber is None in source_info.Statement")
debug_helper.log("Add this as new test case")
if self.tokens:
self.assign_individual_token_values()
if self.tokens[-1].is_unclosed_string():
self.bad_line = self.entire_statement
elif not self.exceptional_case(): # pragma: no cover
debug_helper.log("No meaningful tokens in source_info.Statement")
def get_source_tokens(self):
"""Returns a list containing all the tokens from the source."""
source = ""
if "f-string: invalid syntax" in self.message:
source = self.bad_line
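            # Executing the line on its own re-raises the SyntaxError,
            # which gives us an offset relative to that single line.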
try:
exec(self.bad_line)
except SyntaxError as e:
self.offset = e.offset
self.linenumber = 1
if not source.strip():
self.source_lines = cache.get_source_lines(self.filename)
if self.source_lines == ["\n"]:
self.source_lines = ["...\n"] * (self.linenumber - 1)
self.source_lines.append(self.bad_line)
source = "".join(self.source_lines)
if not source.strip():
source = self.bad_line or "\n"
return token_utils.tokenize(source)
def assign_individual_token_values(self):
"""Assign values of previous and next to bad token and other
related values.
"""
self.nb_tokens = len(self.tokens)
if self.nb_tokens >= 1:
self.first_token = self.tokens[0]
self.last_token = self.tokens[-1]
if self.bad_token is None:
self.bad_token = self.last_token
self.bad_token_index = self.nb_tokens - 1
if self.bad_token_index == 0:
self.prev_token = MEANINGLESS_TOKEN
else:
self.prev_token = self.tokens[self.bad_token_index - 1]
if self.bad_token_index == 0:
self.prev_token = MEANINGLESS_TOKEN
elif self.prev_token is None: # pragma: no cover
debug_helper.log("This case should be added as new test.")
self.prev_token = self.tokens[self.bad_token_index - 1]
if self.last_token != self.bad_token:
self.next_token = self.tokens[self.bad_token_index + 1]
else:
self.next_token = MEANINGLESS_TOKEN
def format_statement(self):
"""Format the statement identified as causing the problem and possibly
a couple of preceding statements, showing the line number and token identified.
"""
# Some errors, like "Too many statistically nested blocs" prevent
# Python from making a source available.
if not (self.statement_tokens or self.original_bad_line.strip("\n")):
self.formatted_partial_source = ""
return
# In some IndentationError cases, and possibly others,
# Python shows the ^ just before the first token
        # Note that start_col from tokenize (0-based) and self.offset (1-based)
        # differ in their origins.
offset_col = self.tokens[-1].start_col
if self.offset == offset_col:
self.offset += 1
self.end_offset += 1
if self.statement_tokens:
lines = self.get_lines_to_show()
else:
lines = [(self.linenumber, self.original_bad_line.strip("\n"))]
self.annotate_lines(lines)
def get_lines_to_show(self):
"""Restricts the lines of code to be included when showing the location
of the error.
"""
self.create_location_markers()
first_token = self.all_statements[0][0]
lines = get_lines_from_statement(
self.statement_tokens, self.linenumber, first_token
)
if lines:
last_linenumber_included, _ = lines[-1]
if self.linenumber > last_linenumber_included:
# The problem line was an empty line
lines.append((last_linenumber_included + 1, ""))
return lines
def annotate_lines(self, lines):
"""Adds the caret marks used to show the location of the error"""
        # lines is a list of tuples; each tuple has the line number
        # as its first item and the text of the line as the second.
if lines:
            # Ensures that we align all the lines vertically on the vertical
            # bar following the line number, even if the number of digits
            # changes, e.g. when we include lines 8, 9, 10, 11.
nb_digits = len(str(lines[-1][0]))
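            # e.g. if the last line shown is line 104, nb_digits == 3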
else:
debug_helper.log("No line to annotate in annotate_lines")
self.formatted_partial_source = "\n"
return
no_mark = LINE_NUMBER % nb_digits
with_mark = MARKED_LINE_NUMBER % nb_digits if len(lines) > 1 else no_mark
leading_spaces = " " * (len(LINE_NUMBER % lines[-1][0]) - 3)
new_lines = []
for i, line in lines:
if i in self.location_markers:
num = with_mark.format(i)
new_lines.append(num + line)
new_lines.append(leading_spaces + self.location_markers[i])
else:
num = no_mark.format(i)
new_lines.append(num + line)
self.formatted_partial_source = "\n".join(new_lines)
def create_location_markers(self):
# In some cases, the location markers are determined while analysing
# the statement to find the cause.
if self.location_markers:
return
        # We generally go with the information obtained by Python,
        # although it might sometimes be off by one.
        nb_carets = 0
        # The reported offset can disagree with the token position;
        # align the marker with the start of the bad token.
        self.offset = self.bad_token.start_col
if self.highlighted_tokens:
last_token = self.highlighted_tokens[-1]
            if last_token.is_comment() and last_token is not self.bad_token:
last_token = self.highlighted_tokens[-2]
if self.end_offset is not None:
if self.end_offset < last_token.end_col:
nb_carets = self.end_offset - self.bad_token.start_col
else:
nb_carets = last_token.end_col - self.bad_token.start_col
# Highlight at least the entire bad token for friendly
nb_carets = max(len(self.bad_token.string), nb_carets)
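        # For example, with self.offset == 4 and nb_carets == 2, the marker
        # line is "    ^^", placing two carets under the bad token.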
offset_mark = " " * self.offset + "^" * nb_carets
self.location_markers = {self.linenumber: offset_mark}
def obtain_statement(self, source_tokens):
"""This method scans the source searching for the statement that
caused the problem. Most often, it will be a single line of code.
However, it might occasionally be a multiline statement that
includes code surrounded by some brackets spanning multiple lines.
It will set the following:
- self.statement_tokens: a list of all the tokens in the problem statement
- self.bad_token: the token identified as causing the problem based on offset.
- self.statement_brackets: a list of open brackets '([{' not yet closed
- self.begin_brackets: a list of open brackets '([{' in a statement
before the bad token.
- self.end_bracket: an unmatched closing bracket ')]}' signaling an error
- self.all_statements: list of all individual identified statements up to
and including the problem statement.
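
        As a purely illustrative example, while scanning

            a = (1,
                 2]

        self.statement_brackets holds "(" when the mismatched "]" is
        reached; that "]" token is then recorded as self.end_bracket.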
"""
previous_row = -1
previous_token = None
continuation_line = False
# Some tokens cannot occur within brackets; if they are indicated as being
# the offending token, it might be because we have an unclosed bracket.
should_begin_statement = [
"assert",
"async",
"await",
"break",
"class",
"continue",
"def",
"del",
"elif",
"except",
"finally",
"global",
"import", # cannot write: from (x import ...
"nonlocal",
"pass",
"raise",
"return",
"try",
"with",
"while",
"yield",
]
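        # For example, in "d = (1,\nreturn d", flagging "return" while "("
        # is still open points to the unclosed bracket as the real culprit.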
last_line_to_include = self.linenumber
for token in source_tokens:
if (
token.start_row > last_line_to_include
and not continuation_line
and not self.statement_brackets
):
break
# is this a new statement?
# Valid statements will have matching brackets (), {}, [].
# A new statement will typically start on a new line and will be
# preceded by valid statements.
# An initial version was based on the assumption that any semicolon
# would be used correctly and would indicate the end of a statement;
# however, I am guessing that it more likely indicates
# a typo, and that the user wanted to write a comma or a colon, so I
# do not treat them in any special way.
if token.is_unclosed_string():
token.string = token.string.rstrip()
self.statement_tokens.append(token)
self.linenumber = token.start_row
break
if token.start_row > previous_row:
if previous_token is not None:
continuation_line = previous_token.line.endswith("\\\n")
if (
token.start_row <= last_line_to_include
and not self.statement_brackets
):
if self.statement_tokens:
self.all_statements.append(self.statement_tokens[:])
self.statement_tokens = []
self.begin_brackets = []
if token.start_row > last_line_to_include and self.statement_brackets:
# The statement continues beyond the line with the error, so
# we make sure to include this additional line.
last_line_to_include = token.start_row
previous_row = token.start_row
self.statement_tokens.append(token)
# The offset seems to be different depending on Python versions,
# sometimes matching the beginning of a token, sometimes the end.
# Furthermore, the end of a token (end_col) might be equal to
# the beginning of the next (start_col).
# Additionally, for Python 3.10, multiple tokens can be highlighted
# if they are on the same line.
if self.highlighted_tokens: # not None and not empty list
if (
self.linenumber == token.start_row
and self.end_offset is not None
and (
self.offset < token.end_col < self.end_offset
or self.end_offset == 0
)
and token.string.strip()
):
self.highlighted_tokens.append(token)
elif (
token.start_row == last_line_to_include
and token.start_col <= self.offset <= token.end_col
and self.bad_token is None
and token.string.strip()
):
self.bad_token = token
if self.bad_token.is_comment():
self.bad_token_comment = self.bad_token
self.bad_token = self.prev_token
if self.highlighted_tokens is not None:
self.highlighted_tokens.append(self.bad_token)
elif (
token.string.strip()
and not token.is_comment()
and self.bad_token is None
):
self.prev_token = token
previous_token = token
if (
self.bad_token in should_begin_statement
and self.bad_token != self.statement_tokens[0]
and self.statement_brackets
):
break # we almost certainly have an unclosed bracket
            # Note: '' in 'any string' == True, so be careful not to
            # accidentally treat an empty string as a bracket.
if not token.string or token.string not in "()[]}{":
continue
if token.string in "([{":
self.statement_brackets.append(token.string)
if self.bad_token is None or self.bad_token is token:
self.begin_brackets.append(token)
elif token.string in ")]}":
self.end_bracket = token
if not self.statement_brackets:
break
open_bracket = self.statement_brackets.pop()
if not matching_brackets(open_bracket, token.string):
self.statement_brackets.append(open_bracket)
break
if self.begin_brackets and self.bad_token is None:
self.begin_brackets.pop()
self.end_bracket = None
if self.statement_tokens: # Protecting against EOF while parsing
last_line = token_utils.untokenize(self.statement_tokens)
if last_line.strip():
self.all_statements.append(self.statement_tokens)
elif self.all_statements:
self.statement_tokens = self.all_statements[-1]
def remove_meaningless_tokens(self):
"""Given a list of tokens, remove all space-like tokens and comments;
also assign the index value of the bad token.
"""
index = 0
tokens = []
for tok in self.statement_tokens:
if not tok.string.strip() or tok.is_comment():
continue
tokens.append(tok)
if tok is self.bad_token:
self.bad_token_index = index
index += 1
return tokens
def get_lines_from_statement(tokens, linenumber, prev_token):
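    """Returns a list of (line number, line content) tuples for the lines
    of the statement to show, keeping only lines close to the one where
    the error was found and splitting multiline (docstring) tokens into
    individual lines.
    """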
lines = []
end_docstring = False
for token in tokens:
current_linenumber = token.start_row
current_line = token.line.rstrip()
if lines and prev_token.end_row == current_linenumber:
continue
if linenumber < current_linenumber and token == tokens[0]:
break
if linenumber - current_linenumber < 5:
if end_docstring == current_line:
end_docstring = False
continue
if "\n" in current_line:
text = current_line.split("\n") # handle docstring
for line in text:
lines.append((current_linenumber, line))
current_linenumber += 1
end_docstring = text[-1]
current_linenumber -= 1
else:
lines.append((current_linenumber, current_line))
else:
if "\n" in current_line:
text = current_line.split("\n")
end_docstring = text[-1]
prev_token = token
return lines