Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r90279:94e61b03c50a
Date: 2017-02-21 19:14 +0100
http://bitbucket.org/pypy/pypy/changeset/94e61b03c50a/
Log: Backed out changeset 5e8ef7ce3887 Turns out that Python 2's utf-8 decoding is "wrong", and runicode.py implements a more correct version which follows Python 3 more closely. This is documented in the docstring of test_invalid_cb_for_3bytes_seq(). diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -133,6 +133,23 @@ def _invalid_cont_byte(ordch): return ordch>>6 != 0x2 # 0b10 +_invalid_byte_2_of_2 = _invalid_cont_byte +_invalid_byte_3_of_3 = _invalid_cont_byte +_invalid_byte_3_of_4 = _invalid_cont_byte +_invalid_byte_4_of_4 = _invalid_cont_byte + +@enforceargs(allow_surrogates=bool) +def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): + return (ordch2>>6 != 0x2 or # 0b10 + (ordch1 == 0xe0 and ordch2 < 0xa0) + # surrogates shouldn't be valid UTF-8! + or (ordch1 == 0xed and ordch2 > 0x9f and not allow_surrogates)) + +def _invalid_byte_2_of_4(ordch1, ordch2): + return (ordch2>>6 != 0x2 or # 0b10 + (ordch1 == 0xf0 and ordch2 < 0x90) or + (ordch1 == 0xf4 and ordch2 > 0x8f)) + def str_decode_utf_8_impl(s, size, errors, final, errorhandler, allow_surrogates): if size == 0: @@ -153,20 +170,60 @@ if pos + n > size: if not final: break + # argh, this obscure block of code is mostly a copy of + # what follows :-( charsleft = size - pos - 1 # either 0, 1, 2 # note: when we get the 'unexpected end of data' we need # to care about the pos returned; it can be lower than size, # in case we need to continue running this loop - endpos = pos + 1 - if charsleft >= 1 and not _invalid_cont_byte(ord(s[pos+1])): - endpos = pos + 2 - if charsleft >= 2 and not _invalid_cont_byte(ord(s[pos+2])): - endpos = pos + 3 - r, pos = errorhandler(errors, 'utf8', - 'unexpected end of data', - s, pos, endpos) - result.append(r) - continue + if not charsleft: + # there's only the start byte and nothing else + r, pos = errorhandler(errors, 'utf8', + 'unexpected end of data', + s, pos, pos+1) + result.append(r) + 
continue + ordch2 = ord(s[pos+1]) + if n == 3: + # 3-bytes seq with only a continuation byte + if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): + # second byte invalid, take the first and continue + r, pos = errorhandler(errors, 'utf8', + 'invalid continuation byte', + s, pos, pos+1) + result.append(r) + continue + else: + # second byte valid, but third byte missing + r, pos = errorhandler(errors, 'utf8', + 'unexpected end of data', + s, pos, pos+2) + result.append(r) + continue + elif n == 4: + # 4-bytes seq with 1 or 2 continuation bytes + if _invalid_byte_2_of_4(ordch1, ordch2): + # second byte invalid, take the first and continue + r, pos = errorhandler(errors, 'utf8', + 'invalid continuation byte', + s, pos, pos+1) + result.append(r) + continue + elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])): + # third byte invalid, take the first two and continue + r, pos = errorhandler(errors, 'utf8', + 'invalid continuation byte', + s, pos, pos+2) + result.append(r) + continue + else: + # there's only 1 or 2 valid cb, but the others are missing + r, pos = errorhandler(errors, 'utf8', + 'unexpected end of data', + s, pos, pos+charsleft+1) + result.append(r) + continue + raise AssertionError("unreachable") if n == 0: r, pos = errorhandler(errors, 'utf8', @@ -179,7 +236,7 @@ elif n == 2: ordch2 = ord(s[pos+1]) - if _invalid_cont_byte(ordch2): + if _invalid_byte_2_of_2(ordch2): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) @@ -193,48 +250,41 @@ elif n == 3: ordch2 = ord(s[pos+1]) ordch3 = ord(s[pos+2]) - if _invalid_cont_byte(ordch2): + if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) result.append(r) continue - elif _invalid_cont_byte(ordch3): + elif _invalid_byte_3_of_3(ordch3): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+2) result.append(r) continue # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 
xxxxyyyy yyzzzzzz - c = (((ordch1 & 0x0F) << 12) + # 0b00001111 - ((ordch2 & 0x3F) << 6) + # 0b00111111 - (ordch3 & 0x3F)) # 0b00111111 - if c < 2048 or (0xd800 <= c <= 0xdfff and not allow_surrogates): - r, pos = errorhandler(errors, 'utf8', - 'invalid continuation byte', - s, pos, pos+2) - result.append(r) - continue - result.append(unichr(c)) + result.append(unichr(((ordch1 & 0x0F) << 12) + # 0b00001111 + ((ordch2 & 0x3F) << 6) + # 0b00111111 + (ordch3 & 0x3F))) # 0b00111111 pos += 3 elif n == 4: ordch2 = ord(s[pos+1]) ordch3 = ord(s[pos+2]) ordch4 = ord(s[pos+3]) - if _invalid_cont_byte(ordch2): + if _invalid_byte_2_of_4(ordch1, ordch2): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) result.append(r) continue - elif _invalid_cont_byte(ordch3): + elif _invalid_byte_3_of_4(ordch3): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+2) result.append(r) continue - elif _invalid_cont_byte(ordch4): + elif _invalid_byte_4_of_4(ordch4): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+3) @@ -245,12 +295,6 @@ ((ordch2 & 0x3F) << 12) + # 0b00111111 ((ordch3 & 0x3F) << 6) + # 0b00111111 (ordch4 & 0x3F)) # 0b00111111 - if c <= 65535 or c > 0x10ffff: - r, pos = errorhandler(errors, 'utf8', - 'invalid continuation byte', - s, pos, pos+3) - result.append(r) - continue if c <= MAXUNICODE: result.append(UNICHR(c)) else: diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -700,27 +700,6 @@ assert decoder(seq, len(seq), 'ignore', final=True ) == (res, len(seq)) - @settings(max_examples=10000) - @given(strategies.binary()) - def test_str_check_utf8(self, s): - try: - u = s.decode("utf8") - valid = True - except UnicodeDecodeError as e: - valid = False - try: - result, length = runicode.str_decode_utf_8(s, len(s), None, - errorhandler=None, final=True, allow_surrogates=True) 
- except UnicodeDecodeError as a: - assert not valid - assert a.start == e.start - assert a.end == e.end - assert str(a) == str(e) - else: - assert valid - assert result == u - assert length == len(s) - class TestEncoding(UnicodeTests): def test_all_ascii(self): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit