Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r89999:4fd6ad6559bf
Date: 2017-02-07 13:39 +0100
http://bitbucket.org/pypy/pypy/changeset/4fd6ad6559bf/
Log: fix and clean up of f-strings: now internally handled as utf-8,
instead of as unicode-that-can-only-contain-ascii-anyway
diff --git a/pypy/interpreter/astcompiler/fstring.py
b/pypy/interpreter/astcompiler/fstring.py
--- a/pypy/interpreter/astcompiler/fstring.py
+++ b/pypy/interpreter/astcompiler/fstring.py
@@ -2,7 +2,7 @@
from pypy.interpreter.pyparser import parsestring
from pypy.interpreter import error
from pypy.interpreter import unicodehelper
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
def add_constant_string(astbuilder, joined_pieces, w_string, atom_node):
@@ -44,7 +44,8 @@
if astbuilder.recursive_parser is None:
astbuilder.error("internal error: parser not available for parsing "
"the expressions inside the f-string", atom_node)
- source = '(%s)' % source.encode('utf-8')
+ assert isinstance(source, str) # utf-8 encoded
+ source = '(%s)' % source
info = pyparse.CompileInfo("<fstring>", "eval",
consts.PyCF_SOURCE_IS_UTF8 |
@@ -85,13 +86,13 @@
# The first char must be a left brace, or we wouldn't have gotten
# here. Skip over it.
- u = fstr.unparsed
+ s = fstr.unparsed
i = fstr.current_index
- assert u[i] == u'{'
+ assert s[i] == '{'
i += 1
expr_start = i
- while i < len(u):
+ while i < len(s):
# Loop invariants.
assert nested_depth >= 0
@@ -100,9 +101,9 @@
else:
assert string_type == 0
- ch = u[i]
+ ch = s[i]
# Nowhere inside an expression is a backslash allowed.
- if ch == u'\\':
+ if ch == '\\':
# Error: can't include a backslash character, inside
# parens or strings or not.
astbuilder.error("f-string expression part "
@@ -116,7 +117,7 @@
# Does this match the string_type (single or triple
# quoted)?
if string_type == 3:
- if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+ if i + 2 < len(s) and s[i + 1] == s[i + 2] == ch:
# We're at the end of a triple quoted string.
i += 3
string_type = 0
@@ -128,9 +129,9 @@
string_type = 0
quote_char = 0
continue
- elif ch == u"'" or ch == u'"':
+ elif ch == "'" or ch == '"':
# Is this a triple quoted string?
- if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+ if i + 2 < len(s) and s[i + 1] == s[i + 2] == ch:
string_type = 3
i += 2
else:
@@ -138,20 +139,20 @@
string_type = 1
# Start looking for the end of the string.
quote_char = ord(ch)
- elif ch in u"[{(":
+ elif ch in "[{(":
nested_depth += 1
- elif nested_depth != 0 and ch in u"]})":
+ elif nested_depth != 0 and ch in "]})":
nested_depth -= 1
- elif ch == u'#':
+ elif ch == '#':
# Error: can't include a comment character, inside parens
# or not.
astbuilder.error("f-string expression part cannot include '#'",
atom_node)
- elif nested_depth == 0 and ch in u"!:}":
+ elif nested_depth == 0 and ch in "!:}":
# First, test for the special case of "!=". Since '=' is
# not an allowed conversion character, nothing is lost in
# this test.
- if ch == '!' and i + 1 < len(u) and u[i+1] == u'=':
+ if ch == '!' and i + 1 < len(s) and s[i+1] == '=':
# This isn't a conversion character, just continue.
i += 1
continue
@@ -171,33 +172,33 @@
if nested_depth:
astbuilder.error("f-string: mismatched '(', '{' or '['", atom_node)
- if i >= len(u):
+ if i >= len(s):
unexpected_end_of_string(astbuilder, atom_node)
# Compile the expression as soon as possible, so we show errors
# related to the expression before errors related to the
# conversion or format_spec.
- expr = f_string_compile(astbuilder, u[expr_start:i], atom_node)
+ expr = f_string_compile(astbuilder, s[expr_start:i], atom_node)
assert isinstance(expr, ast.Expression)
# Check for a conversion char, if present.
- if u[i] == u'!':
+ if s[i] == '!':
i += 1
- if i >= len(u):
+ if i >= len(s):
unexpected_end_of_string(astbuilder, atom_node)
- conversion = ord(u[i])
+ conversion = ord(s[i])
i += 1
if conversion not in (ord('s'), ord('r'), ord('a')):
astbuilder.error("f-string: invalid conversion character: "
"expected 's', 'r', or 'a'", atom_node)
# Check for the format spec, if present.
- if i >= len(u):
+ if i >= len(s):
unexpected_end_of_string(astbuilder, atom_node)
- if u[i] == u':':
+ if s[i] == ':':
i += 1
- if i >= len(u):
+ if i >= len(s):
unexpected_end_of_string(astbuilder, atom_node)
fstr.current_index = i
subpieces = []
@@ -205,7 +206,7 @@
format_spec = f_string_to_ast_node(astbuilder, subpieces, atom_node)
i = fstr.current_index
- if i >= len(u) or u[i] != u'}':
+ if i >= len(s) or s[i] != '}':
unexpected_end_of_string(astbuilder, atom_node)
# We're at a right brace. Consume it.
@@ -222,7 +223,7 @@
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
# Return the next literal part. Updates the current index inside 'fstr'.
# Differs from CPython: this version handles double-braces on its own.
- u = fstr.unparsed
+ s = fstr.unparsed
literal_start = fstr.current_index
in_named_escape = False
@@ -230,21 +231,21 @@
# brace (which isn't part of a unicode name escape such as
# "\N{EULER CONSTANT}"), or the end of the string.
i = literal_start
- builder = UnicodeBuilder()
- while i < len(u):
- ch = u[i]
- if (not in_named_escape and ch == u'{' and i - literal_start >= 2
- and u[i - 2] == u'\\' and u[i - 1] == u'N'):
+ builder = StringBuilder()
+ while i < len(s):
+ ch = s[i]
+ if (not in_named_escape and ch == '{' and i - literal_start >= 2
+ and s[i - 2] == '\\' and s[i - 1] == 'N'):
in_named_escape = True
- elif in_named_escape and ch == u'}':
+ elif in_named_escape and ch == '}':
in_named_escape = False
- elif ch == u'{' or ch == u'}':
+ elif ch == '{' or ch == '}':
# Check for doubled braces, but only at the top level. If
# we checked at every level, then f'{0:{3}}' would fail
# with the two closing braces.
- if rec == 0 and i + 1 < len(u) and u[i + 1] == ch:
+ if rec == 0 and i + 1 < len(s) and s[i + 1] == ch:
i += 1 # skip over the second brace
- elif rec == 0 and ch == u'}':
+ elif rec == 0 and ch == '}':
# Where a single '{' is the start of a new expression, a
# single '}' is not allowed.
astbuilder.error("f-string: single '}' is not allowed",
@@ -259,14 +260,13 @@
fstr.current_index = i
literal = builder.build()
- if not fstr.raw_mode and u'\\' in literal:
- # xxx messy
+ if not fstr.raw_mode and '\\' in literal:
space = astbuilder.space
- literal = literal.encode('utf-8')
literal = parsestring.decode_unicode_utf8(space, literal, 0,
len(literal))
- literal = unicodehelper.decode_unicode_escape(space, literal)
- return literal
+ return unicodehelper.decode_unicode_escape(space, literal)
+ else:
+ return literal.decode('utf-8')
def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec):
@@ -274,15 +274,15 @@
# following expression node. Updates the current index inside 'fstr'.
literal = fstring_find_literal(astbuilder, fstr, atom_node, rec)
- u = fstr.unparsed
+ s = fstr.unparsed
i = fstr.current_index
- if i >= len(u) or u[i] == u'}':
+ if i >= len(s) or s[i] == '}':
# We're at the end of the string or the end of a nested
# f-string: no expression.
expr = None
else:
# We must now be the start of an expression, on a '{'.
- assert u[i] == u'{'
+ assert s[i] == '{'
expr = fstring_find_expr(astbuilder, fstr, atom_node, rec)
return literal, expr
@@ -320,14 +320,16 @@
astbuilder.error("f-string: unexpected end of string", atom_node)
if rec != 0 and (fstr.current_index >= len(fstr.unparsed) or
- fstr.unparsed[fstr.current_index] != u'}'):
+ fstr.unparsed[fstr.current_index] != '}'):
astbuilder.error("f-string: expecting '}'", atom_node)
def f_string_to_ast_node(astbuilder, joined_pieces, atom_node):
# remove empty Strs
+ space = astbuilder.space
values = [node for node in joined_pieces
- if not (isinstance(node, ast.Str) and not node.s)]
+ if not isinstance(node, ast.Str)
+ or space.is_true(node.s)]
if len(values) > 1:
return ast.JoinedStr(values, atom_node.get_lineno(),
atom_node.get_column())
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py
b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1194,6 +1194,38 @@
raises(SyntaxError, self.run, "f'''{5)\n#}'''")
raises(SyntaxError, self.run, "f'\\x'")
+ def test_fstring_encoding(self):
+ src = """# -*- coding: latin-1 -*-\nz=ord(f'{"\xd8"}')\n"""
+ yield self.st, src, 'z', 0xd8
+ src = """# -*- coding: utf-8 -*-\nz=ord(f'{"\xc3\x98"}')\n"""
+ yield self.st, src, 'z', 0xd8
+
+ src = """z=ord(f'\\xd8')"""
+ yield self.st, src, 'z', 0xd8
+ src = """z=ord(f'\\u00d8')"""
+ yield self.st, src, 'z', 0xd8
+
+ src = """# -*- coding: latin-1 -*-\nz=ord(f'\xd8')\n"""
+ yield self.st, src, 'z', 0xd8
+ src = """# -*- coding: utf-8 -*-\nz=ord(f'\xc3\x98')\n"""
+ yield self.st, src, 'z', 0xd8
+
+ def test_fstring_encoding_r(self):
+ src = """# -*- coding: latin-1 -*-\nz=ord(fr'{"\xd8"}')\n"""
+ yield self.st, src, 'z', 0xd8
+ src = """# -*- coding: utf-8 -*-\nz=ord(rf'{"\xc3\x98"}')\n"""
+ yield self.st, src, 'z', 0xd8
+
+ src = """z=fr'\\xd8'"""
+ yield self.st, src, 'z', "\\xd8"
+ src = """z=rf'\\u00d8'"""
+ yield self.st, src, 'z', "\\u00d8"
+
+ src = """# -*- coding: latin-1 -*-\nz=ord(rf'\xd8')\n"""
+ yield self.st, src, 'z', 0xd8
+ src = """# -*- coding: utf-8 -*-\nz=ord(fr'\xc3\x98')\n"""
+ yield self.st, src, 'z', 0xd8
+
class AppTestCompiler:
diff --git a/pypy/interpreter/pyparser/parsestring.py
b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -7,7 +7,7 @@
class W_FString(W_Root):
def __init__(self, unparsed, raw_mode):
- assert isinstance(unparsed, unicode)
+ assert isinstance(unparsed, str) # utf-8 encoded string
self.unparsed = unparsed # but the quotes are removed
self.raw_mode = raw_mode
self.current_index = 0 # for astcompiler.fstring
@@ -85,14 +85,13 @@
q -= 2
if unicode_literal and not rawmode: # XXX Py_UnicodeFlag is ignored for now
+ assert 0 <= ps <= q
+ if saw_f:
+ return W_FString(s[ps:q], rawmode)
if encoding is None:
- assert 0 <= ps <= q
substr = s[ps:q]
else:
substr = decode_unicode_utf8(space, s, ps, q)
- if saw_f:
- v = unicodehelper.decode_utf8(space, substr)
- return W_FString(v, rawmode)
v = unicodehelper.decode_unicode_escape(space, substr)
return space.wrap(v)
@@ -109,10 +108,10 @@
if rawmode or '\\' not in substr:
if not unicode_literal:
return space.newbytes(substr)
+ elif saw_f:
+ return W_FString(substr, rawmode)
else:
v = unicodehelper.decode_utf8(space, substr)
- if saw_f:
- return W_FString(v, rawmode)
return space.wrap(v)
v = PyString_DecodeEscape(space, substr, 'strict', encoding)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit