# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of Parso's tokenize test
# https://github.com/davidhalter/parso/blob/master/test/test_tokenize.py
#
# The following changes were made:
# - Convert base test to UnitTest
# - Remove grammar-specific tests
# pyre-unsafe
# -*- coding: utf-8 -*-  # This file contains Unicode characters.
from textwrap import dedent
from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.python.tokenize import PythonToken, tokenize
from libcst._parser.parso.utils import parse_version_string, split_lines
from libcst.testing.utils import data_provider, UnitTest
# To make the token types easier to access, alias them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END
def _get_token_list(string, version=None):
    # Parse the version string; defaults to the currently running Python version.
version_info = parse_version_string(version)
return list(tokenize(string, version_info))
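

# Illustrative usage (a sketch; exact values depend on the input): each
# PythonToken is a (type, string, start_pos, prefix) namedtuple, so its fields
# can be unpacked directly:
#
#     typ, string, start_pos, prefix = _get_token_list("pass")[0]
#     # typ == NAME, string == "pass", start_pos == (1, 0), prefix == ""
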
class ParsoTokenizerTest(UnitTest):
def test_simple_no_whitespace(self):
# Test a simple one line string, no preceding whitespace
simple_docstring = '"""simple one line docstring"""'
token_list = _get_token_list(simple_docstring)
_, value, _, prefix = token_list[0]
assert prefix == ""
assert value == '"""simple one line docstring"""'
def test_simple_with_whitespace(self):
# Test a simple one line string with preceding whitespace and newline
simple_docstring = ' """simple one line docstring""" \r\n'
token_list = _get_token_list(simple_docstring)
assert token_list[0][0] == INDENT
typ, value, start_pos, prefix = token_list[1]
assert prefix == " "
assert value == '"""simple one line docstring"""'
assert typ == STRING
typ, value, start_pos, prefix = token_list[2]
assert prefix == " "
assert typ == NEWLINE
def test_function_whitespace(self):
# Test function definition whitespace identification
fundef = dedent(
"""
def test_whitespace(*args, **kwargs):
x = 1
if x > 0:
print(True)
"""
)
token_list = _get_token_list(fundef)
for _, value, _, prefix in token_list:
if value == "test_whitespace":
assert prefix == " "
if value == "(":
assert prefix == ""
if value == "*":
assert prefix == ""
if value == "**":
assert prefix == " "
if value == "print":
assert prefix == " "
if value == "if":
assert prefix == " "
def test_tokenize_multiline_I(self):
        # Make sure a multiline string containing a newline puts the end marker
        # on the next line.
fundef = '''""""\n'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""\n', (1, 0), ""),
PythonToken(ENDMARKER, "", (2, 0), ""),
]
def test_tokenize_multiline_II(self):
        # Make sure a multiline string without newlines puts the end marker on
        # the same line.
fundef = '''""""'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""', (1, 0), ""),
PythonToken(ENDMARKER, "", (1, 4), ""),
]
def test_tokenize_multiline_III(self):
        # Make sure a multiline string containing newlines puts the end marker
        # on the line after the last newline, even if there are several.
fundef = '''""""\n\n'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ""),
PythonToken(ENDMARKER, "", (3, 0), ""),
]
def test_identifier_contains_unicode(self):
fundef = dedent(
"""
def 我あφ():
pass
"""
)
token_list = _get_token_list(fundef)
unicode_token = token_list[1]
assert unicode_token[0] == NAME
def test_ur_literals(self):
"""
        Decided to parse `u''` literals regardless of the Python version. This
        probably makes sense:

        - Python 3 before 3.3 doesn't support it, but accepting it doesn't
          hurt. While this is technically incorrect, it is only incorrect for
          "old" versions that matter less and less.
        - All the other Python versions work very well with it.
"""
def check(literal, is_literal=True):
token_list = _get_token_list(literal)
typ, result_literal, _, _ = token_list[0]
if is_literal:
if typ != FSTRING_START:
assert typ == STRING
assert result_literal == literal
else:
assert typ == NAME
check('u""')
check('ur""', is_literal=False)
check('Ur""', is_literal=False)
check('UR""', is_literal=False)
check('bR""')
# Starting with Python 3.3 this ordering is also possible.
check('Rb""')
        # Starting with Python 3.6, f-strings were introduced.
check('fr""', is_literal=True)
check('rF""', is_literal=True)
check('f""', is_literal=True)
check('F""', is_literal=True)
def test_error_literal(self):
error_token, newline, endmarker = _get_token_list('"\n')
assert error_token.type == ERRORTOKEN
assert error_token.string == '"'
assert newline.type == NEWLINE
assert endmarker.type == ENDMARKER
assert endmarker.prefix == ""
bracket, error_token, endmarker = _get_token_list('( """')
assert error_token.type == ERRORTOKEN
assert error_token.prefix == " "
assert error_token.string == '"""'
assert endmarker.type == ENDMARKER
assert endmarker.prefix == ""
def test_endmarker_end_pos(self):
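        # The ENDMARKER should always end at the very last position in the
        # source: (number of lines, length of the final line).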
def check(code):
tokens = _get_token_list(code)
lines = split_lines(code)
assert tokens[-1].end_pos == (len(lines), len(lines[-1]))
check("#c")
check("#c\n")
check("a\n")
check("a")
check(r"a\\n")
check("a\\")
@data_provider(
(
# Indentation
(" foo", [INDENT, NAME, DEDENT]),
(" foo\n bar", [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(
" foo\n bar \n baz",
[
INDENT,
NAME,
NEWLINE,
ERROR_DEDENT,
NAME,
NEWLINE,
ERROR_DEDENT,
NAME,
DEDENT,
],
),
(" foo\nbar", [INDENT, NAME, NEWLINE, DEDENT, NAME]),
# Name stuff
("1foo1", [NUMBER, NAME]),
("மெல்லினம்", [NAME]),
("²", [ERRORTOKEN]),
("ä²ö", [NAME, ERRORTOKEN, NAME]),
("ää²¹öö", [NAME, ERRORTOKEN, NAME]),
)
)
def test_token_types(self, code, types):
actual_types = [t.type for t in _get_token_list(code)]
assert actual_types == types + [ENDMARKER]
def test_error_string(self):
t1, newline, endmarker = _get_token_list(' "\n')
assert t1.type == ERRORTOKEN
assert t1.prefix == " "
assert t1.string == '"'
assert newline.type == NEWLINE
assert endmarker.prefix == ""
assert endmarker.string == ""
def test_indent_error_recovery(self):
code = dedent(
"""\
str(
from x import a
def
"""
)
lst = _get_token_list(code)
expected = [
# `str(`
INDENT,
NAME,
OP,
            # `from x`
NAME,
NAME,
            # `import a` on the same line as the previous `from x`
NAME,
NAME,
NEWLINE,
            # A dedent happens because there's an import now, and the import
            # statement "breaks" out of the opening paren on the first line.
DEDENT,
            # `def`
NAME,
NEWLINE,
ENDMARKER,
]
assert [t.type for t in lst] == expected
def test_error_token_after_dedent(self):
code = dedent(
"""\
class C:
pass
$foo
"""
)
lst = _get_token_list(code)
expected = [
NAME,
NAME,
OP,
NEWLINE,
INDENT,
NAME,
NEWLINE,
DEDENT,
# $foo\n
ERRORTOKEN,
NAME,
NEWLINE,
ENDMARKER,
]
assert [t.type for t in lst] == expected
def test_brackets_no_indentation(self):
"""
        There used to be an issue where the parenthesis count could go below
        zero. This should not happen.
"""
code = dedent(
"""\
}
{
}
"""
)
lst = _get_token_list(code)
assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]
def test_form_feed(self):
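        # The form feed is not a token of its own; it is consumed into the
        # prefix of the token that follows it.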
error_token, endmarker = _get_token_list(
dedent(
'''\
\f"""'''
)
)
assert error_token.prefix == "\f"
assert error_token.string == '"""'
assert endmarker.prefix == ""
def test_carriage_return(self):
lst = _get_token_list(" =\\\rclass")
assert [t.type for t in lst] == [INDENT, OP, DEDENT, NAME, ENDMARKER]
def test_backslash(self):
code = "\\\n# 1 \n"
(endmarker,) = _get_token_list(code)
assert endmarker.prefix == code
@data_provider(
(
('f"', [FSTRING_START], "3.7"),
('f""', [FSTRING_START, FSTRING_END], "3.7"),
('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END], "3.7"),
('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP], "3.7"),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
# format spec
(
r'f"Some {x:.2f}{y}"',
[
FSTRING_START,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_STRING,
OP,
OP,
NAME,
OP,
FSTRING_END,
],
"3.7",
),
# multiline f-string
('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(
'f"""abc{\n123}def"""',
[
FSTRING_START,
FSTRING_STRING,
OP,
NUMBER,
OP,
FSTRING_STRING,
FSTRING_END,
],
"3.7",
),
# a line continuation inside of an fstring_string
('f"abc\\\ndef"', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(
'f"\\\n{123}\\\n"',
[
FSTRING_START,
FSTRING_STRING,
OP,
NUMBER,
OP,
FSTRING_STRING,
FSTRING_END,
],
"3.7",
),
# a line continuation inside of an fstring_expr
('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END], "3.7"),
            # a line continuation inside of a format spec
(
'f"{123:.2\\\nf}"',
[FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END],
"3.7",
),
# a newline without a line continuation inside a single-line string is
# wrong, and will generate an ERRORTOKEN
(
'f"abc\ndef"',
[FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN],
"3.7",
),
# a more complex example
(
r'print(f"Some {x:.2f}a{y}")',
[
NAME,
OP,
FSTRING_START,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_STRING,
OP,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_END,
OP,
],
"3.7",
),
)
)
def test_fstring(self, code, types, py_version):
actual_types = [t.type for t in _get_token_list(code, py_version)]
assert types + [ENDMARKER] == actual_types
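

if __name__ == "__main__":
    # Convenience entry point for running this file directly: a minimal
    # sketch, assuming UnitTest (from libcst.testing.utils) subclasses
    # unittest.TestCase.
    import unittest

    unittest.main()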