[pypy-commit] pypy py3.6: Fix error handler calls when encoding to utf-16

rlamy Wed, 28 Aug 2019 07:43:50 -0700

Author: Ronan Lamy <[email protected]>
Branch: py3.6
Changeset: r97322:7ace73ce896a
Date: 2019-08-28 15:41 +0100
http://bitbucket.org/pypy/pypy/changeset/7ace73ce896a/


Log:    Fix error handler calls when encoding to utf-16

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1215,21 +1215,22 @@
     while pos < size:
         try:
             cp = rutf8.codepoint_at_pos(s, pos)
+            pos = rutf8.next_codepoint_pos(s, pos)
         except IndexError:
             # malformed codepoint, blindly use ch
             pos += 1
             if errorhandler:
                 r, newindex, rettype = errorhandler(
                     errors, public_encoding_name, 'malformed unicode',
-                    s, pos - 1, pos)
+                    s, index, index + 1)
                 if rettype == 'u':
                     for cp in rutf8.Utf8StringIterator(r):
                         if cp < 0xD800:
                             _STORECHAR(result, cp, byteorder)
                         else:
-                            errorhandler('strict', public_encoding_name,
-                                         'malformed unicode',
-                                     s, pos-1, pos)
+                            errorhandler(
+                                'strict', public_encoding_name,
+                                'malformed unicode', s, index, index + 1)
                 else:
                     for ch in r:
                         cp = ord(ch)
@@ -1238,7 +1239,7 @@
                         else:
                             errorhandler('strict', public_encoding_name,
                                          'malformed unicode',
-                                     s, pos-1, pos)
+                                     s, index, index + 1)
             else:
                 cp = ord(s[pos])
                 _STORECHAR(result, cp, byteorder)
@@ -1253,7 +1254,7 @@
         else:
             r, newindex, rettype = errorhandler(
                 errors, public_encoding_name, 'surrogates not allowed',
-                s, pos, pos+1)
+                s, index, index+1)
             if rettype == 'u':
                 for cp in rutf8.Utf8StringIterator(r):
                     if cp < 0xD800 or allow_surrogates:
@@ -1261,7 +1262,7 @@
                     else:
                         errorhandler('strict', public_encoding_name,
                                      'surrogates not allowed',
-                                     s, pos, pos+1)
+                                     s, index, index+1)
             else:
                 for ch in r:
                     cp = ord(ch)
@@ -1270,13 +1271,11 @@
                     else:
                         errorhandler('strict', public_encoding_name,
                                      'surrogates not allowed',
-                                 s, pos, pos+1)
+                                 s, index, index+1)
             if index != newindex:  # Should be uncommon
                 index = newindex
                 pos = rutf8._pos_at_index(s, newindex)
             continue
-
-        pos = rutf8.next_codepoint_pos(s, pos)
         index += 1
 
     return result.build()
@@ -1516,19 +1515,19 @@
     return result.build()
 
 def utf8_encode_utf_32(s, errors,
-                          errorhandler=None, allow_surrogates=True):
+                       errorhandler=None, allow_surrogates=True):
     return utf8_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "native",
                                         'utf-32-' + BYTEORDER2)
 
 def utf8_encode_utf_32_be(s, errors,
-                                  errorhandler=None, allow_surrogates=True):
+                          errorhandler=None, allow_surrogates=True):
     return utf8_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "big",
                                         'utf-32-be')
 
 def utf8_encode_utf_32_le(s, errors,
-                                  errorhandler=None, allow_surrogates=True):
+                          errorhandler=None, allow_surrogates=True):
     return utf8_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "little",
                                         'utf-32-le')
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -271,10 +271,10 @@
             assert 'unexpected end of data' in str(exc.value)
             useq = bseq.decode('utf-8', 'replace')
             assert  useq == u'\ufffd', (bseq, useq)
-            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') == 
+            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') ==
                     u'aaaa\ufffdbbbb')
             assert bseq.decode('utf-8', 'ignore') == ''
-            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') == 
+            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') ==
                     u'aaaabbbb')
 
     def test_invalid_cb_for_3bytes_seq(self):
@@ -337,7 +337,7 @@
             exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
             assert err in str(exc.value)
             assert seq.decode('utf-8', 'replace') == res
-            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') == 
+            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
                          'aaaa' + res + 'bbbb')
             res = res.replace('\ufffd', '')
             assert seq.decode('utf-8', 'ignore') == res
@@ -425,7 +425,7 @@
             exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
             assert err in str(exc.value)
             assert seq.decode('utf-8', 'replace') == res
-            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') == 
+            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
                          'aaaa' + res + 'bbbb')
             res = res.replace('\ufffd', '')
             assert seq.decode('utf-8', 'ignore') == res
@@ -1148,6 +1148,11 @@
                 '[]'.encode(encoding))
             assert (u'[\udc80]'.encode(encoding, "replace") ==
                 '[?]'.encode(encoding))
+            # surrogate sequences
+            assert (u'[\ud800\udc80]'.encode(encoding, "ignore") ==
+                '[]'.encode(encoding))
+            assert (u'[\ud800\udc80]'.encode(encoding, "replace") ==
+                '[??]'.encode(encoding))
         for encoding, ill_surrogate in [('utf-8', b'\xed\xb2\x80'),
                                         ('utf-16-le', b'\x80\xdc'),
                                         ('utf-16-be', b'\xdc\x80'),
@@ -1167,7 +1172,7 @@
                 assert test_string.encode(encoding, 'surrogatepass') == test_sequence
                 assert test_sequence.decode(encoding, 'surrogatepass') == test_string
                 assert test_sequence.decode(encoding, 'ignore') == before + after
-                assert test_sequence.decode(encoding, 'replace') == (before + 
+                assert test_sequence.decode(encoding, 'replace') == (before +
                                                 ill_formed_sequence_replace + after), str(
                 (encoding, test_sequence, before + ill_formed_sequence_replace + after))
                 backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate)
@@ -1388,7 +1393,7 @@
         # from stdlib tests, bad byte: \xa5 is unmapped in iso-8859-3
         assert (b"foo\xa5bar".decode("iso-8859-3", "surrogateescape") ==
                      "foo\udca5bar")
-        assert ("foo\udca5bar".encode("iso-8859-3", "surrogateescape") == 
+        assert ("foo\udca5bar".encode("iso-8859-3", "surrogateescape") ==
                          b"foo\xa5bar")
 
     def test_warn_escape_decode(self):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.6: Fix error handler calls when encoding to utf-16

Reply via email to