[pypy-commit] pypy py3k: Fix int("\ud800") and float("\ud800")

arigo Wed, 27 Jul 2016 11:35:39 -0700

Author: Armin Rigo <ar...@tunes.org>
Branch: py3k
Changeset: r85883:f1508f8d4bf6
Date: 2016-07-27 20:36 +0200
http://bitbucket.org/pypy/pypy/changeset/f1508f8d4bf6/


Log:    Fix int("\ud800") and float("\ud800")

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -141,9 +141,7 @@
     return result
 
 def encode_utf8(space, uni, allow_surrogates=False):
-    # Note that this function never raises UnicodeEncodeError,
-    # since surrogate pairs are allowed.
-    # This is not the case with Python3.
+    # Note that Python3 tends to forbid lone surrogates
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),
diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -871,8 +871,15 @@
             return _from_intlike(space, w_inttype, space.trunc(w_value))
         elif space.isinstance_w(w_value, space.w_unicode):
             from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
-            return _string_to_int_or_long(space, w_inttype, w_value,
-                                          unicode_to_decimal_w(space, w_value))
+            try:
+                b = unicode_to_decimal_w(space, w_value)
+            except OperationError as e:
+                if not e.match(space, space.w_UnicodeEncodeError):
+                    raise
+                raise oefmt(space.w_ValueError,
+                            "int() called with a string containing a "
+                            "lone surrogate")
+            return _string_to_int_or_long(space, w_inttype, w_value, b)
         elif (space.isinstance_w(w_value, space.w_bytearray) or
               space.isinstance_w(w_value, space.w_bytes)):
             return _string_to_int_or_long(space, w_inttype, w_value,
diff --git a/pypy/objspace/std/test/test_floatobject.py b/pypy/objspace/std/test/test_floatobject.py
--- a/pypy/objspace/std/test/test_floatobject.py
+++ b/pypy/objspace/std/test/test_floatobject.py
@@ -149,6 +149,8 @@
         assert float(memoryview(b"inf")) == inf
         assert float(bytearray(b"inf")) == inf
 
+        raises(UnicodeEncodeError, float, u"\ud800")
+
     def test_float_unicode(self):
         # u00A0 and u2000 are some kind of spaces
         assert 42.75 == float(chr(0x00A0)+str("42.75")+chr(0x2000))
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -415,3 +415,6 @@
         assert a is not b
         b -= 1
         assert a is b
+
+    def test_invalid_surrogate(self):
+        raises(ValueError, int, u"\ud800")
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1264,8 +1264,12 @@
 # using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
 #
 # In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call
-# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
-# equivalent plus the final step of encoding the result to utf-8.
+# to _PyUnicode_TransformDecimalAndSpaceToASCII, which is much simpler.
+# We do that here plus the final step of encoding the result to utf-8.
+# This final step corresponds to encode_utf8 *without* allow_surrogates.
+# In float.__new__() and complex.__new__(), a lone surrogate will throw
+# an app-level UnicodeEncodeError.  In long.__new__(), though, CPython3
+# gives inconsistently a ValueError, so we handle that case in intobject.py.
 def unicode_to_decimal_w(space, w_unistr):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
@@ -1282,7 +1286,8 @@
             except KeyError:
                 pass
         result[i] = unichr(uchr)
-    return unicodehelper.encode_utf8(space, u''.join(result), allow_surrogates=True)
+    return unicodehelper.encode_utf8(space, u''.join(result),
+                                     allow_surrogates=False)
 
 
 _repr_function, _ = make_unicode_escape_function(
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3k: Fix int("\ud800") and float("\ud800")

Reply via email to