Author: Armin Rigo <ar...@tunes.org>
Branch: 
Changeset: r90279:94e61b03c50a
Date: 2017-02-21 19:14 +0100
http://bitbucket.org/pypy/pypy/changeset/94e61b03c50a/

Log:    Backed out changeset 5e8ef7ce3887

        Turns out that Python 2's utf-8 decoding is "wrong", and runicode.py
        implements a more correct version which follows Python 3 more
        closely. This is documented in the docstring of
        test_invalid_cb_for_3bytes_seq().

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -133,6 +133,23 @@
 def _invalid_cont_byte(ordch):
     return ordch>>6 != 0x2    # 0b10
 
+_invalid_byte_2_of_2 = _invalid_cont_byte
+_invalid_byte_3_of_3 = _invalid_cont_byte
+_invalid_byte_3_of_4 = _invalid_cont_byte
+_invalid_byte_4_of_4 = _invalid_cont_byte
+
+@enforceargs(allow_surrogates=bool)
+def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xe0 and ordch2 < 0xa0)
+            # surrogates shouldn't be valid UTF-8!
+            or (ordch1 == 0xed and ordch2 > 0x9f and not allow_surrogates))
+
+def _invalid_byte_2_of_4(ordch1, ordch2):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xf0 and ordch2 < 0x90) or
+            (ordch1 == 0xf4 and ordch2 > 0x8f))
+
 def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                           allow_surrogates):
     if size == 0:
@@ -153,20 +170,60 @@
         if pos + n > size:
             if not final:
                 break
+            # argh, this obscure block of code is mostly a copy of
+            # what follows :-(
             charsleft = size - pos - 1 # either 0, 1, 2
             # note: when we get the 'unexpected end of data' we need
             # to care about the pos returned; it can be lower than size,
             # in case we need to continue running this loop
-            endpos = pos + 1
-            if charsleft >= 1 and not _invalid_cont_byte(ord(s[pos+1])):
-                endpos = pos + 2
-                if charsleft >= 2 and not _invalid_cont_byte(ord(s[pos+2])):
-                    endpos = pos + 3
-            r, pos = errorhandler(errors, 'utf8',
-                                  'unexpected end of data',
-                                  s, pos, endpos)
-            result.append(r)
-            continue
+            if not charsleft:
+                # there's only the start byte and nothing else
+                r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+1)
+                result.append(r)
+                continue
+            ordch2 = ord(s[pos+1])
+            if n == 3:
+                # 3-bytes seq with only a continuation byte
+                if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+                    # second byte invalid, take the first and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+1)
+                    result.append(r)
+                    continue
+                else:
+                    # second byte valid, but third byte missing
+                    r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+2)
+                    result.append(r)
+                    continue
+            elif n == 4:
+                # 4-bytes seq with 1 or 2 continuation bytes
+                if _invalid_byte_2_of_4(ordch1, ordch2):
+                    # second byte invalid, take the first and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+1)
+                    result.append(r)
+                    continue
+                elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])):
+                    # third byte invalid, take the first two and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+2)
+                    result.append(r)
+                    continue
+                else:
+                    # there's only 1 or 2 valid cb, but the others are missing
+                    r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+charsleft+1)
+                    result.append(r)
+                    continue
+            raise AssertionError("unreachable")
 
         if n == 0:
             r, pos = errorhandler(errors, 'utf8',
@@ -179,7 +236,7 @@
 
         elif n == 2:
             ordch2 = ord(s[pos+1])
-            if _invalid_cont_byte(ordch2):
+            if _invalid_byte_2_of_2(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
@@ -193,48 +250,41 @@
         elif n == 3:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
-            if _invalid_cont_byte(ordch2):
+            if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif _invalid_cont_byte(ordch3):
+            elif _invalid_byte_3_of_3(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
-            c = (((ordch1 & 0x0F) << 12) +     # 0b00001111
-                 ((ordch2 & 0x3F) << 6) +      # 0b00111111
-                 (ordch3 & 0x3F))              # 0b00111111
-            if c < 2048 or (0xd800 <= c <= 0xdfff and not allow_surrogates):
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+2)
-                result.append(r)
-                continue
-            result.append(unichr(c))
+            result.append(unichr(((ordch1 & 0x0F) << 12) +     # 0b00001111
+                                 ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                                 (ordch3 & 0x3F)))             # 0b00111111
             pos += 3
 
         elif n == 4:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
             ordch4 = ord(s[pos+3])
-            if _invalid_cont_byte(ordch2):
+            if _invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif _invalid_cont_byte(ordch3):
+            elif _invalid_byte_3_of_4(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
-            elif _invalid_cont_byte(ordch4):
+            elif _invalid_byte_4_of_4(ordch4):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+3)
@@ -245,12 +295,6 @@
                  ((ordch2 & 0x3F) << 12) +      # 0b00111111
                  ((ordch3 & 0x3F) << 6) +       # 0b00111111
                  (ordch4 & 0x3F))               # 0b00111111
-            if c <= 65535 or c > 0x10ffff:
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+3)
-                result.append(r)
-                continue
             if c <= MAXUNICODE:
                 result.append(UNICHR(c))
             else:
diff --git a/rpython/rlib/test/test_runicode.py 
b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -700,27 +700,6 @@
             assert decoder(seq, len(seq), 'ignore', final=True
                            ) == (res, len(seq))
 
-    @settings(max_examples=10000)
-    @given(strategies.binary())
-    def test_str_check_utf8(self, s):
-        try:
-            u = s.decode("utf8")
-            valid = True
-        except UnicodeDecodeError as e:
-            valid = False
-        try:
-            result, length = runicode.str_decode_utf_8(s, len(s), None,
-                errorhandler=None, final=True, allow_surrogates=True)
-        except UnicodeDecodeError as a:
-            assert not valid
-            assert a.start == e.start
-            assert a.end == e.end
-            assert str(a) == str(e)
-        else:
-            assert valid
-            assert result == u
-            assert length == len(s)
-
 
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to