[pypy-commit] pypy unicode-utf8: allow a = array.array('u', u'\xff'); a.byteswap(); ord(a[0]) > sys.maxunicode

mattip Tue, 01 Jan 2019 05:31:50 -0800

Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8
Changeset: r95554:962719fced4a
Date: 2019-01-01 15:06 +0200
http://bitbucket.org/pypy/pypy/changeset/962719fced4a/


Log:    allow a = array.array('u', u'\xff'); a.byteswap(); ord(a[0]) >
        sys.maxunicode

diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1040,13 +1040,24 @@
                 return space.newbytes(item)
             elif mytype.typecode == 'u':
                 code = r_uint(ord(item))
-                try:
-                    return space.newutf8(rutf8.unichr_as_utf8(code), 1)
-                except ValueError:
-                    raise oefmt(space.w_ValueError,
-                        "array contains a 32-bit integer that is outside "
-                        "the range [U+0000; U+10ffff] of valid unicode "
-                        "characters")
+                # cpython will allow values > sys.maxunicode
+                # while silently truncating the top bits
+                if code <= r_uint(0x7F):
+                    # Encode ASCII
+                    item = chr(code)
+                elif code <= r_uint(0x07FF):
+                    item = (chr((0xc0 | (code >> 6))) + 
+                            chr((0x80 | (code & 0x3f))))
+                elif code <= r_uint(0xFFFF):
+                    item = (chr((0xe0 | (code >> 12))) +
+                            chr((0x80 | ((code >> 6) & 0x3f))) +
+                            chr((0x80 | (code & 0x3f))))
+                else:
+                    item = (chr((0xf0 | (code >> 18)) & 0xff) +
+                            chr((0x80 | ((code >> 12) & 0x3f))) +
+                            chr((0x80 | ((code >> 6) & 0x3f))) +
+                            chr((0x80 | (code & 0x3f))))
+                return space.newutf8(item, 1)
             assert 0, "unreachable"
 
         # interface
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -44,8 +44,12 @@
         # XXX checking, remove before any performance measurments
         #     ifdef not_running_in_benchmark
         if not we_are_translated():
-            lgt = rutf8.check_utf8(utf8str, True)
-            assert lgt == length
+            try:
+                lgt = rutf8.check_utf8(utf8str, True)
+                assert lgt == length
+            except:
+                # array.array can return invalid unicode
+                pass
 
     @staticmethod
     def from_utf8builder(builder):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: allow a = array.array('u', u'\xff'); a.byteswap(); ord(a[0]) > sys.maxunicode

Reply via email to