[pypy-commit] pypy py3.5: Adapt 'surrogatepass' to utf16 and utf32 encodings. Other encodings fail.

amauryfa Sun, 06 Nov 2016 09:11:26 -0800

Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: py3.5
Changeset: r88151:0e35b7ce07d1
Date: 2016-11-06 16:38 +0100
http://bitbucket.org/pypy/pypy/changeset/0e35b7ce07d1/


Log:    Adapt 'surrogatepass' to utf16 and utf32 encodings. Other encodings
        fail.

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,6 +1,7 @@
+import sys
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import UnicodeBuilder, StringBuilder
 from rpython.rlib.runicode import (
     code_to_unichr, MAXUNICODE,
     raw_unicode_escape_helper_unicode)
@@ -322,14 +323,55 @@
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
 
+
+(ENC_UNKNOWN, ENC_UTF8,
+ ENC_UTF16BE, ENC_UTF16LE,
+ ENC_UTF32BE, ENC_UTF32LE) = range(-1, 5)
+BIG_ENDIAN = sys.byteorder == 'big'
+
+STANDARD_ENCODINGS = {
+    'utf8':      (3, ENC_UTF8),
+    'utf_8':     (3, ENC_UTF8),
+    'cp_utf8':   (3, ENC_UTF8),
+    'utf16':     (2, ENC_UTF16BE) if BIG_ENDIAN else (2, ENC_UTF16LE),
+    'utf_16':    (2, ENC_UTF16BE) if BIG_ENDIAN else (2, ENC_UTF16LE),
+    'utf16be':   (2, ENC_UTF16BE),
+    'utf_16be':  (2, ENC_UTF16BE),
+    'utf16_be':  (2, ENC_UTF16BE),
+    'utf_16_be': (2, ENC_UTF16BE),
+    'utf16le':   (2, ENC_UTF16LE),
+    'utf_16le':  (2, ENC_UTF16LE),
+    'utf16_le':  (2, ENC_UTF16LE),
+    'utf_16_le': (2, ENC_UTF16LE),
+    'utf32':     (4, ENC_UTF32BE) if BIG_ENDIAN else (4, ENC_UTF32LE),
+    'utf_32':    (4, ENC_UTF32BE) if BIG_ENDIAN else (4, ENC_UTF32LE),
+    'utf32be':   (4, ENC_UTF32BE),
+    'utf_32be':  (4, ENC_UTF32BE),
+    'utf32_be':  (4, ENC_UTF32BE),
+    'utf_32_be': (4, ENC_UTF32BE),
+    'utf32le':   (4, ENC_UTF32LE),
+    'utf_32le':  (4, ENC_UTF32LE),
+    'utf32_le':  (4, ENC_UTF32LE),
+    'utf_32_le': (4, ENC_UTF32LE),
+}
+
+def get_standard_encoding(encoding):
+    encoding = encoding.lower().replace('-', '_')
+    return STANDARD_ENCODINGS.get(encoding, (0, ENC_UNKNOWN))
+
 def surrogatepass_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         w_end = space.getattr(w_exc, space.wrap('end'))
+        encoding = space.str_w(space.getattr(w_exc, space.wrap('encoding')))
+        bytelength, code = get_standard_encoding(encoding)
+        if code == ENC_UNKNOWN:
+            # Not supported, fail with original exception
+            raise OperationError(space.type(w_exc), w_exc)
         end = space.int_w(w_end)
-        res = ''
+        builder = StringBuilder()
         pos = start
         while pos < end:
             ch = ord(obj[pos])
@@ -337,31 +379,61 @@
             if ch < 0xd800 or ch > 0xdfff:
                 # Not a surrogate, fail with original exception
                 raise OperationError(space.type(w_exc), w_exc)
-            res += chr(0xe0 | (ch >> 12))
-            res += chr(0x80 | ((ch >> 6) & 0x3f))
-            res += chr(0x80 | (ch & 0x3f))
-        return space.newtuple([space.newbytes(res), w_end])
+            if code == ENC_UTF8:
+                builder.append(chr(0xe0 | (ch >> 12)))
+                builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
+                builder.append(chr(0x80 | (ch & 0x3f)))
+            elif code == ENC_UTF16LE:
+                builder.append(chr(ch & 0xff))
+                builder.append(chr(ch >> 8))
+            elif code == ENC_UTF16BE:
+                builder.append(chr(ch >> 8))
+                builder.append(chr(ch & 0xff))
+            elif code == ENC_UTF32LE:
+                builder.append(chr(ch & 0xff))
+                builder.append(chr(ch >> 8))
+                builder.append(chr(0))
+                builder.append(chr(0))
+            elif code == ENC_UTF32BE:
+                builder.append(chr(0))
+                builder.append(chr(0))
+                builder.append(chr(ch >> 8))
+                builder.append(chr(ch & 0xff))
+        return space.newtuple([space.newbytes(builder.build()), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         obj = space.bytes_w(space.getattr(w_exc, space.wrap('object')))
+        encoding = space.str_w(space.getattr(w_exc, space.wrap('encoding')))
+        bytelength, code = get_standard_encoding(encoding)
         ch = 0
         # Try decoding a single surrogate character. If there are more,
         # let the codec call us again
-        ch0 = ord(obj[start + 0])
-        ch1 = ord(obj[start + 1])
+        ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
+        ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
         ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
-        if (ch2 != -1 and
-            ch0 & 0xf0 == 0xe0 and
-            ch1 & 0xc0 == 0x80 and
-            ch2 & 0xc0 == 0x80):
-            # it's a three-byte code
-            ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
-            if ch < 0xd800 or ch > 0xdfff:
-                # it's not a surrogate - fail
-                ch = 0
+        ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
+        if code == ENC_UTF8:
+            if (ch1 != -1 and ch2 != -1 and
+                ch0 & 0xf0 == 0xe0 and
+                ch1 & 0xc0 == 0x80 and
+                ch2 & 0xc0 == 0x80):
+                # it's a three-byte code
+                ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
+        elif code == ENC_UTF16LE:
+            ch = (ch1 << 8) | ch0
+        elif code == ENC_UTF16BE:
+            ch = (ch0 << 8) | ch1
+        elif code == ENC_UTF32LE:
+            ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
+        elif code == ENC_UTF32BE:
+            ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
+        if ch < 0xd800 or ch > 0xdfff:
+            # it's not a surrogate - fail
+            ch = 0
         if ch == 0:
             raise OperationError(space.type(w_exc), w_exc)
-        return space.newtuple([space.wrap(unichr(ch)), space.wrap(start + 3)])
+        return space.newtuple([space.wrap(unichr(ch)),
+                               space.wrap(start + bytelength)])
     else:
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -624,6 +624,51 @@
         raises(UnicodeDecodeError, b"abc\xed\xa0z".decode, "utf-8",
                "surrogatepass")
 
+    def test_badandgoodsurrogatepassexceptions(self):
+        import codecs
+        surrogatepass_errors = codecs.lookup_error('surrogatepass')
+        # "surrogatepass" complains about a non-exception passed in
+        raises(TypeError, surrogatepass_errors, 42)
+        # "surrogatepass" complains about the wrong exception types
+        raises(TypeError, surrogatepass_errors, UnicodeError("ouch"))
+        # "surrogatepass" can not be used for translating
+        raises(TypeError, surrogatepass_errors,
+               UnicodeTranslateError("\ud800", 0, 1, "ouch"))
+        # Use the correct exception
+        for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
+            raises(UnicodeEncodeError, surrogatepass_errors,
+                   UnicodeEncodeError(enc, "a", 0, 1, "ouch"))
+            raises(UnicodeDecodeError, surrogatepass_errors,
+                   UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch"))
+        for s in ("\ud800", "\udfff", "\ud800\udfff"):
+            raises(UnicodeEncodeError, surrogatepass_errors,
+                   UnicodeEncodeError("ascii", s, 0, len(s), "ouch"))
+        tests = [
+            ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
+            ("utf-16le", "\ud800", b'\x00\xd8', 2),
+            ("utf-16be", "\ud800", b'\xd8\x00', 2),
+            ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
+            ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
+            ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
+            ("utf-16le", "\udfff", b'\xff\xdf', 2),
+            ("utf-16be", "\udfff", b'\xdf\xff', 2),
+            ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
+            ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
+            ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
+            ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
+            ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
+            ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
+            ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
+        ]
+        for enc, s, b, n in tests:
+            assert surrogatepass_errors(
+                UnicodeEncodeError(enc, "a" + s + "b", 1, 1 + len(s), "ouch")
+            ) == (b, 1 + len(s))
+            assert surrogatepass_errors(
+                UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
+                                   1, 1 + n, "ouch")
+            ) == (s[:1], 1 + n)
+
     def test_badhandler(self):
         import codecs
         results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.5: Adapt 'surrogatepass' to utf16 and utf32 encodings. Other encodings fail.

Reply via email to