[pypy-commit] pypy unicode-utf8-py3: test, refactor logic around invalid continuation byte

mattip Wed, 23 Jan 2019 06:46:58 -0800

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95700:ee5c96ad9ed6
Date: 2019-01-23 11:30 +0200
http://bitbucket.org/pypy/pypy/changeset/ee5c96ad9ed6/


Log:    test, refactor logic around invalid continuation byte

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -328,7 +328,7 @@
         res = rutf8.utf8_encode_mbcs(s, errors, errorhandler,
                                      force_replace=False)
         return res
-        
+
     def str_decode_mbcs(s, errors, final, errorhandler, force_ignore=True):
         slen = len(s)
         res, size = runicode.str_decode_mbcs(s, slen, errors, final=final,
@@ -345,21 +345,18 @@
     res = StringBuilder(slen)
     pos = 0
     end = len(s)
-    suppressing = False # we are in a chain of "bad" unicode, only emit one fix
     while pos < end:
         ordch1 = ord(s[pos])
         # fast path for ASCII
         if ordch1 <= 0x7F:
             pos += 1
             res.append(chr(ordch1))
-            suppressing = False
             continue
 
         if ordch1 <= 0xC1:
             r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
                     s, pos, pos + 1)
-            if not suppressing:
-                res.append(r)
+            res.append(r)
             continue
 
         pos += 1
@@ -371,16 +368,14 @@
                     break
                 r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
             ordch2 = ord(s[pos])
 
             if rutf8._invalid_byte_2_of_2(ordch2):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             pos += 1
@@ -396,10 +391,13 @@
                 if (pos) < end and  rutf8._invalid_byte_2_of_3(ordch1,
                                                 ord(s[pos]), allow_surrogates):
                     msg = "invalid continuation byte"
+                    r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+                                                   pos - 1, pos)
                 else:
                     msg = "unexpected end of data"
-                    suppressing = True
-                r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
+                    r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+                                                   pos - 1, pos)
+                    pos = end
                 res.append(r)
                 continue
             ordch2 = ord(s[pos])
@@ -408,14 +406,12 @@
             if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_3(ordch3):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos + 1)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
             pos += 2
 
@@ -423,7 +419,6 @@
             res.append(chr(ordch1))
             res.append(chr(ordch2))
             res.append(chr(ordch3))
-            suppressing = False
             continue
 
         if ordch1 <= 0xF4:
@@ -433,23 +428,27 @@
                     break
                 if pos < end and rutf8._invalid_byte_2_of_4(ordch1, ord(s[pos])):
                     msg = "invalid continuation byte"
+                    r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+                                                   pos - 1, pos)
                 elif pos + 1 < end and rutf8._invalid_byte_3_of_4(ord(s[pos + 1])):
                     msg = "invalid continuation byte"
+                    pos += 1
+                    r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+                                                   pos - 2, pos)
                 else:
                     msg = "unexpected end of data"
-                suppressing = True
-                r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
+                    r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+                                                   pos - 1, pos)
+                    pos = end
                 res.append(r)
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             ordch4 = ord(s[pos + 2])
-
             if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_4(ordch3):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
@@ -459,8 +458,7 @@
             elif rutf8._invalid_byte_4_of_4(ordch4):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos + 2)
-                if not suppressing:
-                    res.append(r)
+                res.append(r)
                 continue
 
             pos += 3
@@ -469,13 +467,11 @@
             res.append(chr(ordch2))
             res.append(chr(ordch3))
             res.append(chr(ordch4))
-            suppressing = False
             continue
 
         r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
                 s, pos - 1, pos)
-        if not suppressing:
-            res.append(r)
+        res.append(r)
 
     r = res.build()
     return r, rutf8.check_utf8(r, True), pos
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -230,7 +230,7 @@
             (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
              '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
         ]
-        for n, (seq, res) in enumerate(sequences):
+        for (seq, res) in sequences:
             raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
             uni = seq.decode('utf-8', 'replace')
             assert uni == res
@@ -329,7 +329,6 @@
         err = 'invalid continuation byte'
         for s, res in sequences:
             seq = bytes(int(c, 16) for c in s.split())
-            print(seq, [hex(ord(c)) for c in res])
             exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
             assert err in str(exc.value)
             assert seq.decode('utf-8', 'replace') == res
@@ -340,6 +339,95 @@
             assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
                           'aaaa' + res + 'bbbb')
 
+    def test_invalid_cb_for_4bytes_seq(self):
+        """
+        Test that an 'invalid continuation byte' error is raised when the
+        continuation byte(s) of a 4-bytes sequence are invalid.  When
+        errors='replace', the start byte and all the following valid
+        continuation bytes are replaced with a single U+FFFD, and all the bytes
+        starting from the first invalid continuation bytes (included) are
+        handled separately.
+        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
+        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+        because it's the ASCII letter 'A'.
+        Note: when the start byte is E0 or ED, the valid ranges for the first
+        continuation byte are limited to A0..BF and 80..9F respectively.
+        However, when the start byte is ED, Python 2 considers all the bytes
+        in range 80..BF valid.  This is fixed in Python 3.
+        """
+        FFFD = '\ufffd'
+        FFFDx2 = FFFD * 2
+        sequences = [
+            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
+            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
+            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
+            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
+            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
+            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
+            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
+            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
+            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
+            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
+            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
+            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
+            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
+            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
+            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
+            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
+            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
+            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
+            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
+            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
+            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
+            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
+            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
+            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
+            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
+            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
+            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
+            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
+            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
+            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
+            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
+            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
+            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
+            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
+            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
+            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
+            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
+            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
+            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
+            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
+            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
+            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
+            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
+            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
+            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
+            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
+            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
+            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
+            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
+            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
+            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
+            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
+            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
+            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
+            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
+        ]
+        err = 'invalid continuation byte'
+        for s, res in sequences:
+            seq = bytes(int(c, 16) for c in s.split())
+            exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
+            assert err in str(exc.value)
+            assert seq.decode('utf-8', 'replace') == res
+            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') == 
+                         'aaaa' + res + 'bbbb')
+            res = res.replace('\ufffd', '')
+            assert seq.decode('utf-8', 'ignore') == res
+            assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
+                          'aaaa' + res + 'bbbb')
+
+
 class AppTestPartialEvaluation:
     spaceconfig = dict(usemodules=['array',])
 
@@ -612,7 +700,6 @@
         exc = raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
 
         exc = raises(UnicodeDecodeError, codecs.decode, b'\xe0\x00', 'utf-8')
-        print(dir(exc.value))
         assert 'invalid continuation byte' in exc.value.reason
 
     def test_bad_errorhandler_return(self):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: test, refactor logic around invalid continuation byte

Reply via email to