Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: py3.5 Changeset: r93413:b0267eee69d8 Date: 2017-12-13 10:04 +0100 http://bitbucket.org/pypy/pypy/changeset/b0267eee69d8/
Log: The py3k version of the utf32 decoder should not allow lone surrogates. diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -489,21 +489,21 @@ return result, length def py3k_str_decode_utf_16(s, size, errors, final=True, - errorhandler=None): + errorhandler=None): result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, errorhandler, "native", 'utf-16-' + BYTEORDER2) return result, length def py3k_str_decode_utf_16_be(s, size, errors, final=True, - errorhandler=None): + errorhandler=None): result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, errorhandler, "big", 'utf-16-be') return result, length def py3k_str_decode_utf_16_le(s, size, errors, final=True, - errorhandler=None): + errorhandler=None): result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, errorhandler, "little", 'utf-16-le') @@ -714,41 +714,41 @@ def str_decode_utf_32(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "native") + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "native") return result, length def str_decode_utf_32_be(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "big") + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "big") return result, length def str_decode_utf_32_le(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "little") + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "little") return result, length def py3k_str_decode_utf_32(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = 
str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "native", - 'utf-32-' + BYTEORDER2) + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "native", + 'utf-32-' + BYTEORDER2, allow_surrogates=False) return result, length def py3k_str_decode_utf_32_be(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "big", - 'utf-32-be') + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "big", + 'utf-32-be', allow_surrogates=False) return result, length def py3k_str_decode_utf_32_le(s, size, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final, - errorhandler, "little", - 'utf-32-le') + result, length, byteorder = str_decode_utf_32_helper( + s, size, errors, final, errorhandler, "little", + 'utf-32-le', allow_surrogates=False) return result, length BOM32_DIRECT = intmask(0x0000FEFF) @@ -757,7 +757,8 @@ def str_decode_utf_32_helper(s, size, errors, final=True, errorhandler=None, byteorder="native", - public_encoding_name='utf32'): + public_encoding_name='utf32', + allow_surrogates=True): if errorhandler is None: errorhandler = default_unicode_error_decode bo = 0 @@ -821,7 +822,13 @@ continue ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) | (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]])) - if ch >= 0x110000: + if not allow_surrogates and 0xD800 <= ch <= 0xDFFF: + r, pos = errorhandler(errors, public_encoding_name, + "code point in surrogate code point " + "range(0xd800, 0xe000)", + s, pos, len(s)) + result.append(r) + elif ch >= 0x110000: r, pos = errorhandler(errors, public_encoding_name, "codepoint not in range(0x110000)", s, pos, len(s)) diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ 
b/rpython/rlib/test/test_runicode.py @@ -284,6 +284,11 @@ errorhandler, allow_surrogates=False) assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be') assert replace_with(None, '\xca\xfe\xca\xfe') == '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>' + # + assert runicode.str_decode_utf_32_be( + b"\x00\x00\xdc\x80", 4, None) == (u'\udc80', 4) + py.test.raises(UnicodeDecodeError, runicode.py3k_str_decode_utf_32_be, + b"\x00\x00\xdc\x80", 4, None) def test_utf7_bugs(self): u = u'A\u2262\u0391.' _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit