Author: Armin Rigo <ar...@tunes.org> Branch: py3k Changeset: r85883:f1508f8d4bf6 Date: 2016-07-27 20:36 +0200 http://bitbucket.org/pypy/pypy/changeset/f1508f8d4bf6/
Log: Fix int("\ud800") and float("\ud800") diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -141,9 +141,7 @@ return result def encode_utf8(space, uni, allow_surrogates=False): - # Note that this function never raises UnicodeEncodeError, - # since surrogate pairs are allowed. - # This is not the case with Python3. + # Note that Python3 tends to forbid lone surrogates return runicode.unicode_encode_utf_8( uni, len(uni), "strict", errorhandler=encode_error_handler(space), diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py --- a/pypy/objspace/std/intobject.py +++ b/pypy/objspace/std/intobject.py @@ -871,8 +871,15 @@ return _from_intlike(space, w_inttype, space.trunc(w_value)) elif space.isinstance_w(w_value, space.w_unicode): from pypy.objspace.std.unicodeobject import unicode_to_decimal_w - return _string_to_int_or_long(space, w_inttype, w_value, - unicode_to_decimal_w(space, w_value)) + try: + b = unicode_to_decimal_w(space, w_value) + except OperationError as e: + if not e.match(space, space.w_UnicodeEncodeError): + raise + raise oefmt(space.w_ValueError, + "int() called with a string containing a " + "lone surrogate") + return _string_to_int_or_long(space, w_inttype, w_value, b) elif (space.isinstance_w(w_value, space.w_bytearray) or space.isinstance_w(w_value, space.w_bytes)): return _string_to_int_or_long(space, w_inttype, w_value, diff --git a/pypy/objspace/std/test/test_floatobject.py b/pypy/objspace/std/test/test_floatobject.py --- a/pypy/objspace/std/test/test_floatobject.py +++ b/pypy/objspace/std/test/test_floatobject.py @@ -149,6 +149,8 @@ assert float(memoryview(b"inf")) == inf assert float(bytearray(b"inf")) == inf + raises(UnicodeEncodeError, float, u"\ud800") + def test_float_unicode(self): # u00A0 and u2000 are some kind of spaces assert 42.75 == float(chr(0x00A0)+str("42.75")+chr(0x2000)) diff --git 
a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py --- a/pypy/objspace/std/test/test_longobject.py +++ b/pypy/objspace/std/test/test_longobject.py @@ -415,3 +415,6 @@ assert a is not b b -= 1 assert a is b + + def test_invalid_surrogate(self): + raises(ValueError, int, u"\u8000") diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1264,8 +1264,12 @@ # using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does. # # In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call -# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the -# equivalent plus the final step of encoding the result to utf-8. +# to _PyUnicode_TransformDecimalAndSpaceToASCII, which is much simpler. +# We do that here plus the final step of encoding the result to utf-8. +# This final step corresponds to encode_utf8 *without* allow_surrogates. +# In float.__new__() and complex.__new__(), a lone surrogate will throw +# an app-level UnicodeEncodeError. In long.__new__(), though, CPython3 +# inconsistently gives a ValueError, so we handle that case in intobject.py. def unicode_to_decimal_w(space, w_unistr): if not isinstance(w_unistr, W_UnicodeObject): raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) @@ -1282,7 +1286,8 @@ except KeyError: pass result[i] = unichr(uchr) - return unicodehelper.encode_utf8(space, u''.join(result), allow_surrogates=True) + return unicodehelper.encode_utf8(space, u''.join(result), + allow_surrogates=False) _repr_function, _ = make_unicode_escape_function( _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit