Author: Matti Picus <matti.pi...@gmail.com>
Branch: py3.6
Changeset: r96063:3f907d46d82c
Date: 2019-02-18 16:12 +0200
http://bitbucket.org/pypy/pypy/changeset/3f907d46d82c/
Log: merge heads diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,13 +1,6 @@ -import py -import pytest -import struct -import sys from pypy.interpreter.unicodehelper import ( - encode_utf8, decode_utf8, - unicode_encode_utf_8, - unicode_encode_utf_32_be, str_decode_utf_32_be + utf8_encode_utf_8, decode_utf8sp, ) -from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp class Hit(Exception): @@ -20,18 +13,6 @@ raise AttributeError(name) -def test_encode_utf8(): - space = FakeSpace() - assert encode_utf8(space, u"abc") == "abc" - assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4" - py.test.raises(Hit, encode_utf8, space, u"\ud800") - py.test.raises(Hit, encode_utf8, space, u"\udc00") - # for the following test, go to lengths to avoid CPython's optimizer - # and .pyc file storage, which collapse the two surrogates into one - c = u"\udc00" - py.test.raises(Hit, encode_utf8, space, u"\ud800" + c) - - def test_encode_utf_8_combine_surrogates(): """ In the case of a surrogate pair, the error handler should @@ -52,80 +33,20 @@ that is a valid surrogate pair. 
""" assert s[start:end] in [u'\udc80', u'\uD800\uDFFF'] - return [], None, end + return '', 0, end - unicode_encode_utf_8( - u, len(u), True, + utf8_encode_utf_8( + u, 'strict', errorhandler=errorhandler, allow_surrogates=False ) -def test_encode_utf8_allow_surrogates(): - sp = FakeSpace() - assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80" - assert encode_utf8(sp, u"\udc00", allow_surrogates=True) == "\xed\xb0\x80" - c = u"\udc00" - got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True) - assert got == "\xf0\x90\x80\x80" - -def test_encode_utf8sp(): - sp = FakeSpace() - assert encode_utf8sp(sp, u"\ud800") == "\xed\xa0\x80" - assert encode_utf8sp(sp, u"\udc00") == "\xed\xb0\x80" - c = u"\udc00" - got = encode_utf8sp(sp, u"\ud800" + c) - assert got == "\xed\xa0\x80\xed\xb0\x80" - -def test_decode_utf8(): - space = FakeSpace() - assert decode_utf8(space, "abc") == u"abc" - assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234" - py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80") - py.test.raises(Hit, decode_utf8, space, "\xed\xb0\x80") - py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80\xed\xb0\x80") - got = decode_utf8(space, "\xf0\x90\x80\x80") - if sys.maxunicode > 65535: - assert map(ord, got) == [0x10000] - else: - assert map(ord, got) == [55296, 56320] - -def test_decode_utf8_allow_surrogates(): - sp = FakeSpace() - assert decode_utf8(sp, "\xed\xa0\x80", allow_surrogates=True) == u"\ud800" - assert decode_utf8(sp, "\xed\xb0\x80", allow_surrogates=True) == u"\udc00" - got = decode_utf8(sp, "\xed\xa0\x80\xed\xb0\x80", allow_surrogates=True) - assert map(ord, got) == [0xd800, 0xdc00] - got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True) - assert map(ord, got) == [0x10000] - def test_decode_utf8sp(): space = FakeSpace() - assert decode_utf8sp(space, "\xed\xa0\x80") == u"\ud800" - assert decode_utf8sp(space, "\xed\xb0\x80") == u"\udc00" + assert decode_utf8sp(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1, 3) + 
assert decode_utf8sp(space, "\xed\xb0\x80") == ("\xed\xb0\x80", 1, 3) got = decode_utf8sp(space, "\xed\xa0\x80\xed\xb0\x80") - assert map(ord, got) == [0xd800, 0xdc00] + assert map(ord, got[0].decode('utf8')) == [0xd800, 0xdc00] got = decode_utf8sp(space, "\xf0\x90\x80\x80") - assert map(ord, got) == [0x10000] + assert map(ord, got[0].decode('utf8')) == [0x10000] -@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"]) -def test_utf32_surrogates(unich): - assert (unicode_encode_utf_32_be(unich, 1, None) == - struct.pack('>i', ord(unich))) - with pytest.raises(UnicodeEncodeError): - unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False) - - def replace_with(ru, rs): - def errorhandler(errors, enc, msg, u, startingpos, endingpos): - if errors == 'strict': - raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg) - return ru, rs, endingpos - return unicode_encode_utf_32_be( - u"<%s>" % unich, 3, None, - errorhandler, allow_surrogates=False) - - assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be') - assert (replace_with(None, '\xca\xfe\xca\xfe') == - '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>') - - with pytest.raises(UnicodeDecodeError): - str_decode_utf_32_be(b"\x00\x00\xdc\x80", 4, None) diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -218,20 +218,38 @@ return res.build(), len(s), len(s) def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False): - try: - lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates) - except rutf8.CheckError as e: - # XXX change this to non-recursive - pos = e.pos - assert pos >= 0 - start = s[:pos] - upos = rutf8.codepoints_in_utf8(s, end=pos) - ru, lgt, rettype = errorhandler(errors, 'utf8', - 'surrogates not allowed', s, upos, upos + 1) - end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler, - allow_surrogates=allow_surrogates) - s = start + ru + end - return s + size = 
len(s) + if size == 0: + return '' + pos = 0 + upos = 0 + result = StringBuilder(size) + while pos < size: + try: + lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates, start=pos) + if pos == 0: + # fast path + return s + for ch in s[pos:]: + result.append(ch) + break + except rutf8.CheckError as e: + for ch in s[pos:e.pos]: + result.append(ch) + upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos) + pos = e.pos + assert pos >= 0 + res, newindex, rettype = errorhandler(errors, 'utf8', + 'surrogates not allowed', s, upos, upos + 1) + if rettype == 'u': + for cp in rutf8.Utf8StringIterator(res): + result.append(chr(cp)) + else: + for ch in res: + result.append(ch) + upos = newindex + pos = rutf8._pos_at_index(s, upos) + return result.build() def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False): try: @@ -1013,49 +1031,6 @@ return result.build() -@specialize.memo() -def _encode_unicode_error_handler(space): - # Fast version of the "strict" errors handler. - # used only in (unused) encode_utf8 - from rpython.rlib import runicode - def raise_unicode_exception_encode(errors, encoding, msg, uni, - startingpos, endingpos): - assert isinstance(uni, unicode) - u_len = len(uni) - utf8 = runicode.unicode_encode_utf8sp(uni, u_len) - raise OperationError(space.w_UnicodeEncodeError, - space.newtuple([space.newtext(encoding), - space.newtext(utf8, u_len), - space.newint(startingpos), - space.newint(endingpos), - space.newtext(msg)])) - return u'', None, 0 - return raise_unicode_exception_encode - - -def encode_utf8(space, uni, allow_surrogates=False): - # Note that Python3 tends to forbid *all* surrogates in utf-8. - # If allow_surrogates=True, then revert to the Python 2 behavior - # which never raises UnicodeEncodeError. Surrogate pairs are then - # allowed, either paired or lone. A paired surrogate is considered - # like the non-BMP character it stands for. See also *_utf8sp(). 
- xxx - from rpython.rlib import runicode - assert isinstance(uni, unicode) - return runicode.unicode_encode_utf_8( - uni, len(uni), "strict", - errorhandler=_encode_unicode_error_handler(space), - allow_surrogates=allow_surrogates) - -def encode_utf8sp(space, uni, allow_surrogates=True): - xxx - # Surrogate-preserving utf-8 encoding. Any surrogate character - # turns into its 3-bytes encoding, whether it is paired or not. - # This should always be reversible, and the reverse is - # decode_utf8sp(). - from rpython.rlib import runicode - return runicode.unicode_encode_utf8sp(uni, len(uni)) - def decode_utf8sp(space, string): # Surrogate-preserving utf-8 decoding. Assuming there is no # encoding error, it should always be reversible, and the reverse is @@ -1063,7 +1038,6 @@ return str_decode_utf8(string, "string", True, decode_never_raise, allow_surrogates=True) - # ____________________________________________________________ # utf-16 diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -1149,7 +1149,6 @@ backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate) assert test_sequence.decode(encoding, "backslashreplace") == (before + backslashreplace + after) - def test_lone_surrogates_utf_8(self): """ @@ -1158,6 +1157,8 @@ """ e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8", "surrogateescape").value + assert e.start == 1 + assert e.end == 3 assert e.object[e.start:e.end] == u'\ud800\udfff' def test_charmap_encode(self): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit