# Helpers that turn string-literal atoms, including f-strings, into AST nodes.
from pypy.interpreter.astcompiler import ast, consts
from pypy.interpreter.pyparser import parsestring
from pypy.interpreter import error
from pypy.interpreter import unicodehelper
from rpython.rlib.rstring import StringBuilder
def add_constant_string(astbuilder, joined_pieces, w_string, atom_node):
    """Append the constant 'w_string' to 'joined_pieces' as an AST node.

    Implicit string concatenation is implemented here: if the last piece
    is already a constant of the same kind (ast.Str for unicode objects,
    ast.Bytes otherwise), it is removed and its value is added in front
    of 'w_string', so that only one merged node ends up in the list.
    The new node takes its line and column from 'atom_node'.
    """
    space = astbuilder.space
    is_unicode = space.isinstance_w(w_string, space.w_unicode)

    if joined_pieces:
        last = joined_pieces[-1]
        if is_unicode:
            if isinstance(last, ast.Str):
                # unicode + unicode: fold into the previous piece
                w_string = space.add(last.s, w_string)
                joined_pieces.pop()
        elif isinstance(last, ast.Bytes):
            # bytes + bytes: fold into the previous piece
            w_string = space.add(last.s, w_string)
            joined_pieces.pop()

    lineno = atom_node.get_lineno()
    column = atom_node.get_column()
    if is_unicode:
        new_node = ast.Str(w_string, lineno, column)
    else:
        new_node = ast.Bytes(w_string, lineno, column)
    joined_pieces.append(new_node)
def f_constant_string(astbuilder, joined_pieces, u, atom_node):
    """Wrap 'u' with space.newunicode() and add it as a constant piece."""
    w_literal = astbuilder.space.newunicode(u)
    add_constant_string(astbuilder, joined_pieces, w_literal, atom_node)
def f_string_compile(astbuilder, source, atom_node):
    """Compile the text of one '{expr}' part of an f-string.

    'source' is the utf-8 encoded expression text found between the
    braces.  It is wrapped in parentheses, parsed in "eval" mode with
    astbuilder.recursive_parser, and the result of ast_from_node() is
    returned.  An error is reported through astbuilder.error() if the
    expression is empty or all whitespace, or if no recursive parser
    is available.
    """
    # An f-string stays a single literal until this point.  Only now is
    # the AST compiler invoked recursively on each '{expr}' part; the
    # 'expr' text is neither parsed nor tokenized together with the
    # rest of the source code.
    from pypy.interpreter.pyparser import pyparse
    from pypy.interpreter.astcompiler.astbuilder import ast_from_node

    # Reject a 'source' that is empty or made only of whitespace.
    has_content = False
    for char in source:
        if char not in ' \t\n\r\v\f':
            has_content = True
            break
    if not has_content:
        astbuilder.error("f-string: empty expression not allowed", atom_node)

    parser = astbuilder.recursive_parser
    if parser is None:
        astbuilder.error("internal error: parser not available for parsing "
                         "the expressions inside the f-string", atom_node)
    assert isinstance(source, str)    # utf-8 encoded

    flags = consts.PyCF_SOURCE_IS_UTF8 | consts.PyCF_IGNORE_COOKIE
    info = pyparse.CompileInfo("<fstring>", "eval", flags,
                               optimize=astbuilder.compile_info.optimize)
    parse_tree = parser.parse_source('(%s)' % source, info)
    return ast_from_node(astbuilder.space, parse_tree, info,
                         recursive_parser=parser)
def unexpected_end_of_string(astbuilder, atom_node):
    """Report through astbuilder.error() that a closing '}' is missing."""
    message = "f-string: expecting '}'"
    astbuilder.error(message, atom_node)
def fstring_find_expr(astbuilder, fstr, atom_node, rec):
    """Parse the replacement field that starts at fstr.current_index.

    The character at fstr.current_index must be '{'.  The expression
    text is scanned up to the first top-level '!', ':' or '}', compiled
    with f_string_compile(), and followed by an optional conversion
    character and an optional format spec (itself parsed as a nested
    f-string at level 'rec + 1').  On return, fstr.current_index points
    just past the closing '}'.

    Returns an ast.FormattedValue built from the expression body, the
    conversion (ordinal of 's', 'r' or 'a', or -1 if not specified) and
    the format spec node (or None if not specified).  All problems are
    reported through astbuilder.error().
    """
    # Only one level of nesting is supported.
    if rec >= 2:
        astbuilder.error("f-string: expressions nested too deeply", atom_node)

    text = fstr.unparsed
    end = len(text)

    # We are only called when positioned on a left brace; step over it.
    pos = fstr.current_index
    assert text[pos] == '{'
    pos += 1
    expr_start = pos

    # Ordinal of the quote character we are trying to match (single or
    # double quote), or 0 when we are not inside a string.
    open_quote = 0
    # Inside a string: 1 for a normal one, 3 for a triple-quoted one.
    quote_len = 0
    # Current nesting level of braces/parens/brackets in the expression.
    depth = 0

    while pos < end:
        # Loop invariants.
        assert depth >= 0
        if open_quote:
            assert quote_len == 1 or quote_len == 3
        else:
            assert quote_len == 0

        char = text[pos]
        # A backslash is not allowed anywhere in the expression part,
        # whether inside parens, inside strings, or outside both.
        if char == '\\':
            astbuilder.error("f-string expression part "
                             "cannot include a backslash", atom_node)

        if open_quote:
            # Inside a string: only look for its end.
            if ord(char) == open_quote:
                if quote_len != 3:
                    # End of a normal string.
                    pos += 1
                    quote_len = 0
                    open_quote = 0
                    continue
                if pos + 2 < end and text[pos + 1] == text[pos + 2] == char:
                    # End of a triple-quoted string.
                    pos += 3
                    quote_len = 0
                    open_quote = 0
                    continue
                # A lone quote inside a triple-quoted string: keep going.
        elif char == "'" or char == '"':
            if pos + 2 < end and text[pos + 1] == text[pos + 2] == char:
                # Start of a triple-quoted string; the final 'pos += 1'
                # below steps over the third quote.
                quote_len = 3
                pos += 2
            else:
                # Start of a normal string.
                quote_len = 1
            # From now on, look for the end of the string.
            open_quote = ord(char)
        elif char in "[{(":
            depth += 1
        elif depth != 0 and char in "]})":
            depth -= 1
        elif char == '#':
            # A comment character is not allowed, inside parens or not.
            astbuilder.error("f-string expression part cannot include '#'",
                             atom_node)
        elif depth == 0 and char in "!:}":
            # Special case "!=": since '=' is not a valid conversion
            # character, nothing is lost by treating it as an operator.
            is_not_equal = (char == '!' and pos + 1 < end
                            and text[pos + 1] == '=')
            if not is_not_equal:
                # Normal way out of this loop.
                break
        pos += 1

    # Leaving the loop inside a string or with unbalanced brackets would
    # give a syntax error when compiling the expression anyway, but a
    # better error message can be produced here.
    if open_quote:
        astbuilder.error("f-string: unterminated string", atom_node)
    if depth:
        astbuilder.error("f-string: mismatched '(', '{' or '['", atom_node)
    if pos >= end:
        unexpected_end_of_string(astbuilder, atom_node)

    # Compile the expression first, so that errors in it are shown
    # before errors related to the conversion or the format spec.
    expr = f_string_compile(astbuilder, text[expr_start:pos], atom_node)
    assert isinstance(expr, ast.Expression)

    # Optional conversion character.
    conversion = -1
    if text[pos] == '!':
        pos += 1
        if pos >= end:
            unexpected_end_of_string(astbuilder, atom_node)
        conversion = ord(text[pos])
        pos += 1
        if conversion not in (ord('s'), ord('r'), ord('a')):
            astbuilder.error("f-string: invalid conversion character: "
                             "expected 's', 'r', or 'a'", atom_node)

    # Optional format spec.
    if pos >= end:
        unexpected_end_of_string(astbuilder, atom_node)
    format_spec = None
    if text[pos] == ':':
        pos += 1
        if pos >= end:
            unexpected_end_of_string(astbuilder, atom_node)
        fstr.current_index = pos
        subpieces = []
        parse_f_string(astbuilder, subpieces, fstr, atom_node, rec + 1)
        format_spec = f_string_to_ast_node(astbuilder, subpieces, atom_node)
        pos = fstr.current_index

    if pos >= end or text[pos] != '}':
        unexpected_end_of_string(astbuilder, atom_node)

    # We are on the right brace: consume it.
    fstr.current_index = pos + 1

    # Build the node representing the whole field, with its conversion
    # and format spec.
    return ast.FormattedValue(expr.body, conversion, format_spec,
                              atom_node.get_lineno(),
                              atom_node.get_column())
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Return the next literal part and advance fstr.current_index.

    Scanning starts at fstr.current_index and stops at the end of the
    string or at a brace that is not part of the literal: an un-doubled
    '{' (unless it belongs to a unicode name escape such as
    "\\N{EULER CONSTANT}"), or, in a nested format spec, a '}'.
    Differs from CPython: doubled braces are handled here directly.

    The collected text is returned decoded: through
    parsestring.decode_unicode_utf8() and
    unicodehelper.decode_unicode_escape() when it contains a backslash
    and fstr.raw_mode is false, and with a plain utf-8 decode otherwise.
    A single '}' at the top level is reported via astbuilder.error().
    """
    text = fstr.unparsed
    end = len(text)
    literal_start = fstr.current_index
    pos = literal_start
    inside_name_escape = False
    builder = StringBuilder()

    while pos < end:
        char = text[pos]
        if (not inside_name_escape and char == '{'
                and pos - literal_start >= 2
                and text[pos - 2] == '\\' and text[pos - 1] == 'N'):
            # The '{' right after '\N' opens a unicode name escape.
            inside_name_escape = True
        elif inside_name_escape and char == '}':
            inside_name_escape = False
        elif char == '{' or char == '}':
            # Doubled braces are only checked at the top level.  If they
            # were checked at every level, f'{0:{3}}' would fail on its
            # two closing braces.
            at_top_level = (rec == 0)
            if at_top_level and pos + 1 < end and text[pos + 1] == char:
                pos += 1     # skip over the second brace
            elif at_top_level and char == '}':
                # A single '{' starts a new expression, but a single
                # '}' is not allowed here.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # Either a '{' that starts another expression, or a '}'
                # that ends this f-string (for a nested format_spec).
                break
        builder.append(char)
        pos += 1

    fstr.current_index = pos
    literal = builder.build()
    if fstr.raw_mode or '\\' not in literal:
        return literal.decode('utf-8')
    space = astbuilder.space
    escaped = parsestring.decode_unicode_utf8(space, literal, 0,
                                              len(literal))
    return unicodehelper.decode_unicode_escape(space, escaped)
def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec):
    """Return '(literal, expr)' for the next chunk of the f-string.

    'literal' is the next literal part; 'expr' is the expression node
    that follows it, or None when the literal runs up to the end of the
    string or to the '}' ending a nested f-string.  Updates the current
    index inside 'fstr'.
    """
    literal = fstring_find_literal(astbuilder, fstr, atom_node, rec)
    text = fstr.unparsed
    pos = fstr.current_index
    if pos < len(text) and text[pos] != '}':
        # The literal can only have stopped on the '{' of an expression.
        assert text[pos] == '{'
        return literal, fstring_find_expr(astbuilder, fstr, atom_node, rec)
    # End of the string, or end of a nested f-string: no expression.
    return literal, None
def parse_f_string(astbuilder, joined_pieces, fstr, atom_node, rec=0):
    """Parse 'fstr' from its current index and extend 'joined_pieces'.

    Literal parts are added with f_constant_string() and expression
    nodes are appended as they are found, until a literal is not
    followed by an expression.  'rec' is the nesting level: 0 for the
    f-string itself, higher for a format spec inside a replacement
    field.  Errors are reported through astbuilder.error(), including
    when f-strings are disabled by the 'objspace.fstrings' option.
    """
    # parse_f_string() and fstring_find_literal_and_expr() could be
    # merged into one function with clearer logic; they are kept apart
    # to follow CPython's source code more closely.
    space = astbuilder.space
    if not space.config.objspace.fstrings:
        raise astbuilder.error(
            "f-strings have been disabled in this version of pypy "
            "with the translation option '--no-objspace-fstrings'.  "
            "The PyPy team (and CPython) thinks f-strings don't "
            "add any security risks, but we leave it to you to "
            "convince whoever translated this pypy that it is "
            "really the case", atom_node)

    more_pieces = True
    while more_pieces:
        literal, expr = fstring_find_literal_and_expr(astbuilder, fstr,
                                                      atom_node, rec)
        # The literal part is always added, then the expression if any.
        f_constant_string(astbuilder, joined_pieces, literal, atom_node)
        if expr is not None:
            joined_pieces.append(expr)
        else:
            more_pieces = False    # done with this f-string

    # At level zero we must be at the end of the string; at any other
    # level we must be on a right brace.
    text = fstr.unparsed
    pos = fstr.current_index
    if rec == 0:
        if pos < len(text) - 1:
            astbuilder.error("f-string: unexpected end of string", atom_node)
    elif pos >= len(text) or text[pos] != '}':
        astbuilder.error("f-string: expecting '}'", atom_node)
def f_string_to_ast_node(astbuilder, joined_pieces, atom_node):
    """Build an ast.JoinedStr from 'joined_pieces', dropping empty Strs.

    A JoinedStr is returned in all cases, so that the result cannot be
    picked up later as a docstring.  codegen.py still special-cases
    lists of length 1 to avoid emitting "BUILD_STRING 1" for them.
    """
    space = astbuilder.space
    values = []
    for piece in joined_pieces:
        if isinstance(piece, ast.Str) and not space.is_true(piece.s):
            continue    # empty string constant: leave it out
        values.append(piece)
    return ast.JoinedStr(values, atom_node.get_lineno(),
                         atom_node.get_column())
def string_parse_literal(astbuilder, atom_node):
    """Turn the string children of 'atom_node' into a single AST node.

    Each child is decoded with parsestring.parsestr().  Plain literals
    are merged with add_constant_string(); f-strings are expanded with
    parse_f_string().  Without any f-string, a single resulting piece
    is returned as is; otherwise the pieces are combined with
    f_string_to_ast_node().  Mixing bytes with non-bytes literals is
    reported through astbuilder.error().

    An OperationError matching UnicodeError, ValueError or SyntaxError
    that is raised while handling a child is re-reported through
    astbuilder.error() on that child; any other OperationError
    propagates unchanged.
    """
    space = astbuilder.space
    encoding = astbuilder.compile_info.encoding
    pieces = []
    saw_fstring = False

    for index in range(atom_node.num_children()):
        child = atom_node.get_child(index)
        try:
            w_next = parsestring.parsestr(space, encoding,
                                          child.get_value())
            if isinstance(w_next, parsestring.W_FString):
                parse_f_string(astbuilder, pieces, w_next, atom_node)
                saw_fstring = True
            else:
                add_constant_string(astbuilder, pieces, w_next, atom_node)
        except error.OperationError as e:
            # UnicodeError is tested first, ahead of ValueError.
            if e.match(space, space.w_UnicodeError):
                kind = '(unicode error) '
            elif e.match(space, space.w_ValueError):
                kind = '(value error) '
            elif e.match(space, space.w_SyntaxError):
                kind = ''
            else:
                raise
            # The error carries no position information: turn it into
            # a SyntaxError located on the offending child.
            e.normalize_exception(space)
            w_value = e.get_w_value(space)
            errmsg = space.text_w(space.str(w_value))
            raise astbuilder.error(kind + errmsg, child)

    # The common path: one plain constant and no f-string.
    if not saw_fstring and len(pieces) == 1:
        return pieces[0]

    # Several pieces are a combination of Str and FormattedValue
    # nodes; a Bytes among them means an invalid mixture of bytes and
    # unicode literals.
    for node in pieces:
        if isinstance(node, ast.Bytes):
            astbuilder.error("cannot mix bytes and nonbytes literals",
                             atom_node)
    assert saw_fstring
    return f_string_to_ast_node(astbuilder, pieces, atom_node)
|