Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: py3.5 Changeset: r91090:7ce52a9f8d1f Date: 2017-04-19 00:53 +0200 http://bitbucket.org/pypy/pypy/changeset/7ce52a9f8d1f/
Log: Add support for rejecting lone surrogates in utf16 and utf32 encoders. diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -591,7 +591,10 @@ def unicode_encode_utf_16_helper(s, size, errors, errorhandler=None, + allow_surrogates=True, byteorder='little'): + if errorhandler is None: + errorhandler = default_unicode_error_encode if size == 0: if byteorder == 'native': result = StringBuilder(2) @@ -604,34 +607,60 @@ _STORECHAR(result, 0xFEFF, BYTEORDER) byteorder = BYTEORDER - i = 0 - while i < size: - ch = ord(s[i]) - i += 1 - ch2 = 0 - if ch >= 0x10000: - ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF) - ch = 0xD800 | ((ch-0x10000) >> 10) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 - _STORECHAR(result, ch, byteorder) - if ch2: - _STORECHAR(result, ch2, byteorder) + if ch < 0xD800: + _STORECHAR(result, ch, byteorder) + elif ch >= 0x10000: + _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder) + _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder) + elif ch >= 0xE000 or allow_surrogates: + _STORECHAR(result, ch, byteorder) + else: + ru, rs, pos = errorhandler(errors, 'utf16', + 'surrogates not allowed', + s, pos-1, pos) + if rs is not None: + # py3k only + if len(rs) % 2 != 0: + errorhandler('strict', 'utf16', + 'surrogates not allowed', + s, pos-1, pos) + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0xD800: + _STORECHAR(result, ord(ch), byteorder) + else: + errorhandler('strict', 'utf16', + 'surrogates not allowed', + s, pos-1, pos) + continue return result.build() def unicode_encode_utf_16(s, size, errors, - errorhandler=None): - return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native") + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "native") def unicode_encode_utf_16_be(s, size, errors, - errorhandler=None): - return
unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big") + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "big") def unicode_encode_utf_16_le(s, size, errors, - errorhandler=None): - return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little") + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "little") # ____________________________________________________________ @@ -756,7 +785,10 @@ def unicode_encode_utf_32_helper(s, size, errors, errorhandler=None, + allow_surrogates=True, byteorder='little'): + if errorhandler is None: + errorhandler = default_unicode_error_encode if size == 0: if byteorder == 'native': result = StringBuilder(4) @@ -769,33 +801,57 @@ _STORECHAR32(result, 0xFEFF, BYTEORDER) byteorder = BYTEORDER - i = 0 - while i < size: - ch = ord(s[i]) - i += 1 + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 ch2 = 0 - if MAXUNICODE < 65536 and 0xD800 <= ch <= 0xDBFF and i < size: - ch2 = ord(s[i]) - if 0xDC00 <= ch2 <= 0xDFFF: - ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; - i += 1 + if 0xD800 <= ch < 0xDC00: + if not allow_surrogates: + ru, rs, pos = errorhandler(errors, 'utf32', + 'surrogates not allowed', + s, pos-1, pos) + if rs is not None: + # py3k only + if len(rs) % 4 != 0: + errorhandler('strict', 'utf32', + 'surrogates not allowed', + s, pos-1, pos) + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0xD800: + _STORECHAR32(result, ord(ch), byteorder) + else: + errorhandler('strict', 'utf32', + 'surrogates not allowed', + s, pos-1, pos) + continue + elif MAXUNICODE < 65536 and pos < size: + ch2 = ord(s[pos]) + if 0xDC00 <= ch2 < 0xE000: + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + pos += 1 _STORECHAR32(result, ch, byteorder) return result.build() def unicode_encode_utf_32(s, size, errors, - errorhandler=None): - 
return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "native") + errorhandler=None, allow_surrogates=True): + return unicode_encode_utf_32_helper(s, size, errors, errorhandler, + allow_surrogates, "native") def unicode_encode_utf_32_be(s, size, errors, - errorhandler=None): - return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "big") + errorhandler=None, allow_surrogates=True): + return unicode_encode_utf_32_helper(s, size, errors, errorhandler, + allow_surrogates, "big") def unicode_encode_utf_32_le(s, size, errors, - errorhandler=None): - return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "little") + errorhandler=None, allow_surrogates=True): + return unicode_encode_utf_32_helper(s, size, errors, errorhandler, + allow_surrogates, "little") # ____________________________________________________________ diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -223,6 +223,40 @@ py.test.raises(UnicodeDecodeError, runicode.str_decode_utf_16_le, s, len(s), True) + def test_utf16_surrogates(self): + assert runicode.unicode_encode_utf_16_be( + u"\ud800", 1, None) == '\xd8\x00' + py.test.raises(UnicodeEncodeError, runicode.unicode_encode_utf_16_be, + u"\ud800", 1, None, allow_surrogates=False) + def replace_with(ru, rs): + def errorhandler(errors, enc, msg, u, startingpos, endingpos): + if errors == 'strict': + raise UnicodeEncodeError(enc, u, startingpos, + endingpos, msg) + return ru, rs, endingpos + return runicode.unicode_encode_utf_16_be( + u"<\ud800>", 3, None, + errorhandler, allow_surrogates=False) + assert replace_with(u'rep', None) == '\x00<\x00r\x00e\x00p\x00>' + assert replace_with(None, '\xca\xfe') == '\x00<\xca\xfe\x00>' + + def test_utf32_surrogates(self): + assert runicode.unicode_encode_utf_32_be( + u"\ud800", 1, None) == '\x00\x00\xd8\x00' + py.test.raises(UnicodeEncodeError, 
runicode.unicode_encode_utf_32_be, + u"\ud800", 1, None, allow_surrogates=False) + def replace_with(ru, rs): + def errorhandler(errors, enc, msg, u, startingpos, endingpos): + if errors == 'strict': + raise UnicodeEncodeError(enc, u, startingpos, + endingpos, msg) + return ru, rs, endingpos + return runicode.unicode_encode_utf_32_be( + u"<\ud800>", 3, None, + errorhandler, allow_surrogates=False) + assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be') + assert replace_with(None, '\xca\xfe\xca\xfe') == '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>' + def test_utf7_bugs(self): u = u'A\u2262\u0391.' assert runicode.unicode_encode_utf_7(u, len(u), None) == 'A+ImIDkQ.' _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit