"""A collection of useful functions and methods to deal with tokenizing
source code.
"""
# Starting with Python 3.12, the tokenizer was completely rewritten to handle more general
# f-strings. The following warning was added to the documentation:
#
# Warning Note that the functions in this module are only designed to parse
# syntactically valid Python code (code that does not raise when parsed using ast.parse()).
# The behavior of the functions in this module is undefined when providing invalid Python code
# and it can change at any point.
#
# In order to provide more precise help when user code contains SyntaxErrors, we are left
# to recover some invalid tokens in an ad hoc manner.
#
import ast
import keyword
import sys
import tokenize as py_tokenize
from io import StringIO
from typing import Any, Iterable, List, Sequence, Tuple, Union
from . import debug_helper
_TokenInfo = Union[
py_tokenize.TokenInfo, Tuple[int, str, Tuple[int, int], Tuple[int, int], str]
]
_token_format = "type={type} string={string} start={start} end={end} line={line}"
UNCLOSED = -9
assert UNCLOSED not in py_tokenize.tok_name
py_tokenize.tok_name[UNCLOSED] = "UNCLOSED_STRING"
class Token:
"""Token as generated from Python's tokenize.generate_tokens written here in
a more convenient form, and with some custom methods.
The various parameters are::
type: token type
string: the token written as a string
start = (start_row, start_col)
end = (end_row, end_col)
line: entire line of code where the token is found.
Token instances are mutable objects. Therefore, given a list of tokens,
we can change the value of any token's attribute, untokenize the list and
automatically obtain a transformed source. Almost always, the attribute
to be transformed will be the string attribute.
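
    For example (an illustrative sketch, using the module-level ``tokenize``
    and ``untokenize`` functions defined below)::

        tokens = tokenize("a = 1\\n")
        for tok in tokens:
            if tok == "a":
                tok.string = "alpha"
        untokenize(tokens)   # "alpha = 1\\n"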
"""
def __init__(self, token: _TokenInfo) -> None:
self.type = token[0]
self.string = token[1]
self.start = self.start_row, self.start_col = token[2]
self.end = self.end_row, self.end_col = token[3]
self.line = token[4]
def copy(self) -> "Token":
"""Makes a copy of a given token"""
return Token((self.type, self.string, self.start, self.end, self.line))
def __eq__(self, other: object) -> bool:
"""Compares a Token with another object; returns true if
self.string == other.string or if self.string == other.
"""
return self.string == str(other)
def __repr__(self) -> str: # pragma: no cover
"""Nicely formatted token to help with debugging session.
Note that it does **not** print a string representation that could be
used to create a new ``Token`` instance, which is something you should
never need to do other than indirectly by using the functions
provided in this module.
"""
return _token_format.format(
type="%s (%s)" % (self.type, py_tokenize.tok_name[self.type]),
string=repr(self.string),
start=str(self.start),
end=str(self.end),
line=repr(self.line),
)
def __str__(self) -> str:
"""Returns the string attribute."""
return self.string
    def __len__(self) -> int:
"""Returns the length of the string attribute"""
return len(self.string)
def is_comment(self) -> bool:
"""Returns True if the token is a comment."""
return self.type == py_tokenize.COMMENT
def is_identifier(self) -> bool:
"""Returns ``True`` if the token represents a valid Python identifier
excluding Python keywords.
Note: this is different from Python's string method ``isidentifier``
which also returns ``True`` if the string is a keyword.
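
        For example (an illustrative sketch)::

            tok = tokenize("for")[0]   # the NAME token for the keyword "for"
            tok.is_identifier()        # False: "for" is a keyword
            "for".isidentifier()       # True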
"""
return self.string.isidentifier() and not self.is_keyword()
def is_name(self) -> bool:
"""Returns ``True`` if the token is a type NAME"""
return self.type == py_tokenize.NAME
def is_keyword(self) -> bool:
"""Returns True if the token represents a Python keyword."""
return keyword.iskeyword(self.string) or self.string in ["__debug__", "..."]
def is_number(self) -> bool:
"""Returns True if the token represents a number of any type"""
return self.type == py_tokenize.NUMBER
def is_operator(self) -> bool:
"""Returns true if the token is of type OP"""
return self.type == py_tokenize.OP
def is_float(self) -> bool:
"""Returns True if the token represents a float"""
return self.is_number() and isinstance(ast.literal_eval(self.string), float)
def is_integer(self) -> bool:
"""Returns True if the token represents an integer"""
return self.is_number() and isinstance(ast.literal_eval(self.string), int)
def is_complex(self) -> bool:
"""Returns True if the token represents a complex number"""
return self.is_number() and isinstance(ast.literal_eval(self.string), complex)
def is_space(self) -> bool:
"""Returns True if the token indicates a change in indentation,
the end of a line, or the end of the source
(``INDENT``, ``DEDENT``, ``NEWLINE``, ``NL``, and ``ENDMARKER``).
Note that spaces, including tab characters ``\\t``, between tokens
on a given line are not considered to be tokens themselves.
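
        For example (an illustrative sketch)::

            tokens = tokenize("x  =  1")
            [tok.is_space() for tok in tokens]
            # [False, False, False, True, True]
            # The extra spaces between "x", "=" and "1" do not produce tokens.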
"""
return self.type in (
py_tokenize.INDENT,
py_tokenize.DEDENT,
py_tokenize.NEWLINE,
py_tokenize.NL,
py_tokenize.ENDMARKER,
)
def is_string(self) -> bool:
"""Returns True if the token is a string"""
return self.type == py_tokenize.STRING
def is_f_string(self) -> bool:
"""Return True if the token is an f-string"""
return self.type == py_tokenize.STRING and (
self.string.startswith("f") or self.string.startswith("F")
)
def is_unclosed_string(self) -> bool:
"""Returns True if the token is part of an unclosed triple-quoted string"""
return self.type == UNCLOSED
def immediately_before(self, other: Any) -> bool:
"""Returns True if the current token is immediately before other,
without any intervening space in between the two tokens.
"""
if not isinstance(other, Token): # pragma: no cover
return False
return self.end_row == other.start_row and self.end_col == other.start_col
def immediately_after(self, other: Any) -> bool:
"""Returns True if the current token is immediately after other,
without any intervening space in between the two tokens.
"""
if not isinstance(other, Token): # pragma: no cover
return False
return other.immediately_before(self)
def is_error(self) -> bool:
"""Returns True if the current token is an error token"""
return self.type == py_tokenize.ERRORTOKEN
def name(self) -> str:
"""Returns the name of the character type"""
return py_tokenize.tok_name[self.type]
def is_assignment(op: Union[str, Token]) -> bool:
"""Returns True if op (string or Token) is an assigment or augmented assignment."""
ops = [
"=",
"+=",
"-=",
"*=",
"@=",
"/=",
"//=",
"%=",
"**=",
">>=",
"<<=",
"&=",
"^=",
"|=",
]
if sys.version_info >= (3, 8):
ops.append(":=")
return str(op) in ops
def is_bitwise(op: Union[str, Token]) -> bool:
"""Returns True if op (string or Token) is a bitwise operator."""
ops = ["^", "&", "|", "<<", ">>", "~"]
return str(op) in ops
def is_comparison(op: Union[str, Token]) -> bool:
"""Returns True if op (string or Token) is a comparison operator."""
ops = ["<", ">", "<=", ">=", "==", "!="]
return str(op) in ops
def is_math_op(op: Union[str, Token]) -> bool:
"""Returns True if op (string or Token) is an operator that can be used
as a binary operator in a mathematical operation.
"""
ops = ["+", "-", "*", "**", "@", "/", "//", "%"]
return str(op) in ops
def is_operator(op: Union[str, Token]) -> bool:
"""Returns True if op (string or token) is or could be part of one
of the following: assigment operator, mathematical operator,
bitwise operator, comparison operator."""
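
    For example (an illustrative sketch)::

        is_operator("+=")  # True: augmented assignment
        is_operator("!")   # True: can be part of "!="
        is_operator("(")   # False
    """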
part_ops = ["!", ":"]
return (
is_assignment(op)
or is_bitwise(op)
or is_comparison(op)
or is_math_op(op)
or str(op) in part_ops
)
def fix_empty_last_line(source: str, tokens: List[Token]) -> None:
    """Python's tokenizer entirely drops a last line that consists only of
    space and/or tab characters. To ensure that we can always have::
untokenize(tokenize(source)) == source
we correct the last token content by modifying ``tokens`` in place.
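
    For example (an illustrative sketch)::

        source = "a = 1\\n   "   # the last line contains only spaces
        untokenize(tokenize(source)) == source   # expected to be True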
"""
if not tokens:
return
nb = 0
for char in reversed(source):
if char in (" ", "\t"):
nb += 1
else:
break
last_token = tokens.pop()
row = last_token.start_row
    # When dealing with an empty line, Python 3.12 generates an NL token on the last line
    # and adds an ENDMARKER token on the next (non-existent) line.
    # For previous versions, no NL token was inserted.
if (
sys.version_info >= (3, 12)
and len(tokens) > 1
and tokens[-1].type == py_tokenize.NL
):
prev_token = tokens.pop()
if last_token.start_row != prev_token.start_row:
row = prev_token.start_row
last_token.string = source[-nb:]
last_token.start = (row, last_token.start_col)
last_token.end = (row, last_token.end_col + len(last_token.string))
last_token.line = last_token.string
tokens.append(last_token)
def tokenize(source: str) -> List[Token]:
"""Transforms a source (string) into a list of Tokens.
If an exception is raised by Python's tokenize module, the list of tokens
accumulated up to that point is returned.
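
    For example (an illustrative sketch)::

        tokens = tokenize("if True:\\n    pass\\n")
        tokens[0].string   # "if"
        tokens[0].start    # (1, 0)
        untokenize(tokens) == "if True:\\n    pass\\n"   # True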
"""
tokens = []
try:
for tok in py_tokenize.generate_tokens(StringIO(source).readline):
token = Token(tok)
tokens.append(token)
except IndentationError as e:
try:
_ignore, linenumber, col, line = e.args[1]
type_ = py_tokenize.NAME # Not really relevant what we set here
# except that ERRORTOKEN would cause problems later on.
start = (linenumber, col)
end = (linenumber, len(line))
string = line[col:].strip()
token = Token((type_, string, start, end, line))
tokens.append(token)
return tokens
except Exception as e: # pragma: no cover
debug_helper.log(
"after IndentationError, error from token_utils.tokenize()"
)
debug_helper.log(repr(e))
return tokens
    except Exception:  # includes py_tokenize.TokenError
pass
new_source = untokenize(tokens)
    if not source.strip():  # Used to prevent the "fix" below from being applied
        return tokens  # to MEANINGLESS_TOKEN, which is defined elsewhere
if new_source != source:
length = len(new_source)
remaining = source[length:]
if not (
remaining.lstrip().startswith(('"""', "'''"))
or remaining.lstrip().startswith(("'", '"'))
):
if sys.version_info >= (3, 12):
tokens = append_missing_tokens(tokens, remaining)
return tokens
elif tokens:
return add_unclosed_string_content(tokens, remaining, new_source)
else:
debug_helper.log("Problem: could not successfully tokenize the source.")
return []
if source.endswith((" ", "\t")):
fix_empty_last_line(source, tokens)
return tokens
def add_unclosed_string_content(
    tokens: List[Token], remaining: str, new_source: str
) -> List[Token]:
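    """Appends tokens of type UNCLOSED_STRING covering ``remaining``, the part
    of the source that could not be tokenized, one token per line, so that
    this content is not lost when the full token list is untokenized.
    """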
additional_lines = [line + "\n" for line in remaining.split("\n")]
    # remove the extra \n added to the last line
additional_lines[-1] = additional_lines[-1][0:-1]
last_token = tokens[-1]
string = additional_lines[0]
if new_source.endswith("\n"):
start_row = last_token.end_row + 1
start_col = 0
end_col = len(string)
line = string
else:
spaces_before_quotes = len(string) - len(string.lstrip())
start_row = last_token.end_row
start_col = last_token.end_col + spaces_before_quotes
string = string.lstrip()
end_col = start_col + len(string)
line = last_token.string + string
end_row = start_row
tokens.append(
Token((UNCLOSED, string, (start_row, start_col), (end_row, end_col), line))
)
for line in additional_lines[1:]:
start_row += 1
end_row = start_row
start_col = 0
end_col = len(line)
tokens.append(
Token(
(
UNCLOSED,
line,
(start_row, start_col),
(end_row, end_col),
line,
)
)
)
return tokens
def append_missing_tokens(tokens: List[Token], remaining: str) -> List[Token]:
"""With Python 3.12, the tokenizer changed significantly and can drop content
when invalid code is encountered. This is an attempt to provide a
sufficient fix for friendly-traceback. Note that this will not guarantee that
source == untokenize(tokenize(source))
but should be sufficient for providing the relevant information for
SyntaxError cases.
See https://github.com/friendly-traceback/friendly-traceback/issues/242.
"""
rest_of_line = remaining.split("\n")[0]
if not tokens:
start_row = 1
start_col = 0
line = rest_of_line
else:
start_row, start_col = tokens[-1].end
line = tokens[-1].line
stripped_remaining = rest_of_line.lstrip()
start_col += len(rest_of_line) - len(stripped_remaining)
offset = 0
if stripped_remaining.startswith(("0o", "0O")):
# find first offending digit
for ch in stripped_remaining:
if ch in {"8", "9"}:
break
offset += 1
else:
debug_helper.log("Did not find disallowed octal digit")
return tokens
tok_type = py_tokenize.NUMBER
tok_string = stripped_remaining[:offset]
    elif stripped_remaining and is_invisible_control_character(stripped_remaining[0]):
tok_type = py_tokenize.STRING
tok_string = stripped_remaining[0]
else:
return tokens
tokens.append(
Token(
(
tok_type,
tok_string,
(start_row, start_col),
(start_row, start_col + offset),
line,
)
)
)
source_rest = rest_of_line[offset + 1 :]
remaining_tokens = tokenize(source_rest)
for tok in remaining_tokens:
if not tok.string:
tok.line = ""
else:
tok.line = line
if tok.start_col == 0 and tok.string == "": # Endmarker
tok.start = tok.end = (tok.start_row, tok.start_col) = (
tok.end_row,
tok.end_col,
) = (start_row + 1, 0)
else:
tok.start = (tok.start_row, tok.start_col) = (
start_row,
tok.start_col + start_col + offset,
)
tok.end = (tok.end_row, tok.end_col) = (
start_row,
tok.end_col + start_col + offset,
)
tokens.append(tok)
return tokens
def get_significant_tokens(source: str) -> List[Token]:
"""Gets a list of tokens from a source (str), ignoring comments
as well as any token whose string value is either null or
consists of spaces, newline or tab characters.
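
    For example (an illustrative sketch)::

        tokens = get_significant_tokens("a = 1  # set a")
        [tok.string for tok in tokens]   # ["a", "=", "1"]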
"""
try:
tokens = tokenize(source)
except Exception as e: # pragma: no cover
debug_helper.log("Exception from token_utils.get_significant_tokens()")
debug_helper.log_error(e)
return []
return remove_meaningless_tokens(tokens)
def remove_meaningless_tokens(tokens: Iterable[Token]) -> List[Token]:
"""Given a list of tokens, remove all space-like tokens and comments."""
new_tokens = []
for tok in tokens:
if not tok.string.strip() or tok.is_comment():
continue
new_tokens.append(tok)
return new_tokens
def get_lines(source: str) -> List[List[Token]]:
"""Transforms a source (string) into a list of Tokens, with each
(inner) list containing all the tokens found on a given line of code.
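
    For example (an illustrative sketch)::

        lines = get_lines("a = 1\\nb = 2\\n")
        [tok.string for tok in lines[0]]   # ["a", "=", "1", "\\n"]
        [tok.string for tok in lines[1]]   # ["b", "=", "2", "\\n"]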
"""
lines: List[List[Token]] = []
current_row = -1
new_line: List[Token] = []
tokens = tokenize(source)
if not tokens:
return [[]]
    new_line = [tokens[0]]
    current_row = tokens[0].start_row
    for token in tokens[1:]:
if token.start_row != current_row:
current_row = token.start_row
if new_line:
lines.append(new_line)
new_line = []
new_line.append(token)
lines.append(new_line)
return lines
def strip_comment(line: str) -> str:
"""Removes comments from a line"""
tokens = []
try:
for tok in py_tokenize.generate_tokens(StringIO(line).readline):
token = Token(tok)
if token.is_comment():
continue
if not token.string:
token.line = ""
tokens.append(token)
except py_tokenize.TokenError:
pass
return untokenize(tokens)
def find_substring_index(main: str, substring: str) -> int:
"""Somewhat similar to the find() method for strings,
this function determines if the tokens for substring appear
as a subsequence of the tokens for main. If so, the index
    of the first token is returned; otherwise -1 is returned.
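
    For example (an illustrative sketch)::

        find_substring_index("for i in range(3):", "in range")   # 2
        find_substring_index("for i in range(3):", "while")      # -1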
"""
main_tokens = [tok.string for tok in get_significant_tokens(main)]
sub_tokens = [tok.string for tok in get_significant_tokens(substring)]
    if not sub_tokens:  # no significant tokens in substring
        return -1
    for index, token in enumerate(main_tokens):
if token == sub_tokens[0]:
for i, tok in enumerate(main_tokens[index : index + len(sub_tokens)]):
if tok != sub_tokens[i]:
break
else:
return index
return -1
def dedent(tokens: Iterable[Union[str, Token]], nb: int) -> List[Token]:
"""Given a list of tokens, produces an equivalent list corresponding
to a line of code with the first nb characters removed.
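
    For example (an illustrative sketch)::

        tokens = tokenize("    a = 1\\n")
        untokenize(dedent(tokens, 4))   # "a = 1\\n"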
"""
line = untokenize(tokens)
line = line[nb:]
return tokenize(line)
def indent(
tokens: Iterable[Union[str, Token]], nb: int, tab: bool = False
) -> List[Token]:
"""Given a list of tokens, produces an equivalent list corresponding
to a line of code with nb space characters inserted at the beginning.
If ``tab`` is specified to be ``True``, ``nb`` tab characters are inserted
instead of spaces.
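
    For example (an illustrative sketch)::

        tokens = tokenize("a = 1\\n")
        untokenize(indent(tokens, 4))            # "    a = 1\\n"
        untokenize(indent(tokens, 1, tab=True))  # "\\ta = 1\\n"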
"""
line = untokenize(tokens)
line = "\t" * nb + line if tab else " " * nb + line
return tokenize(line)
def untokenize(tokens: Iterable[Union[str, Token]]) -> str:
"""Return source code based on tokens.
This is similar to Python's own tokenize.untokenize(), except that it
preserves spacing between tokens, by using the line
information recorded by Python's tokenize.generate_tokens.
As a result, if the original source code had multiple spaces between
some tokens or if escaped newlines were used or if tab characters
were present in the original source, those will also be present
in the source code produced by untokenize.
Thus ``source == untokenize(tokenize(source))``.
    Note for those modifying tokens from an original source:
    instead of full Token objects, ``untokenize`` will accept simple
    strings; however, it will only insert them *as is*, without taking them
    into account when it comes to figuring out the spacing between tokens.
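
    For example (an illustrative sketch)::

        source = "a  =  1  # extra spaces\\n"
        untokenize(tokenize(source)) == source   # True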
"""
# Adapted from https://github.com/myint/untokenize,
# Copyright (C) 2013-2018 Steven Myint, MIT License (same as this project).
words = []
previous_line = ""
last_row = 0
last_column = -1
last_non_whitespace_token_type = None
for token in tokens:
if isinstance(token, str): # pragma: no cover
words.append(token)
continue
if token.type == py_tokenize.ENCODING: # pragma: no cover
continue
# Preserve escaped newlines.
if (
last_non_whitespace_token_type != py_tokenize.COMMENT
and token.start_row > last_row
and previous_line.endswith(("\\\n", "\\\r\n", "\\\r"))
):
words.append(previous_line[len(previous_line.rstrip(" \t\n\r\\")) :])
# Preserve spacing.
if token.start_row > last_row:
last_column = 0
if token.start_col > last_column:
words.append(token.line[last_column : token.start_col])
words.append(token.string)
previous_line = token.line
last_row = token.end_row
last_column = token.end_col
if not token.is_space():
last_non_whitespace_token_type = token.type
return "".join(words)
TextOrTokens = Union[str, Sequence[Union[str, Token]]]
def print_tokens(source: TextOrTokens) -> None: # pragma: no cover
"""Prints tokens found in source, excluding spaces and comments.
``source`` is either a string to be tokenized, or a list of Token objects.
This is occasionally useful as a debugging tool.
"""
if isinstance(source[0], Token):
source = untokenize(source)
for lines in get_lines(source): # type: ignore
for token in lines:
print(repr(token))
print()
def is_invisible_control_character(char: str) -> Union[str, bool]:
    """Returns the character if it is a control character that would not be
    visible when printed; returns False otherwise.
    """
if len(char) != 1: # protect against invalid input
return False
n = ord(char)
if 0 <= n <= 0x1F or n == 0x7F or 0x80 <= n <= 0x9F:
return char
return False
def clone(token: Token) -> Token:
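    """Returns a new Token with the same attributes as the given token."""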
return Token(
[
token.type,
token.string,
(token.start_row, token.start_col),
(token.end_row, token.end_col),
token.line,
]
)