Author: Matti Picus <matti.pi...@gmail.com>
Branch: py3.6
Changeset: r96063:3f907d46d82c
Date: 2019-02-18 16:12 +0200
http://bitbucket.org/pypy/pypy/changeset/3f907d46d82c/

Log:    merge heads

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,13 +1,6 @@
-import py
-import pytest
-import struct
-import sys
 from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8,
-    unicode_encode_utf_8,
-    unicode_encode_utf_32_be, str_decode_utf_32_be
+    utf8_encode_utf_8, decode_utf8sp,
 )
-from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
 class Hit(Exception):
@@ -20,18 +13,6 @@
         raise AttributeError(name)
 
 
-def test_encode_utf8():
-    space = FakeSpace()
-    assert encode_utf8(space, u"abc") == "abc"
-    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
-    py.test.raises(Hit, encode_utf8, space, u"\ud800")
-    py.test.raises(Hit, encode_utf8, space, u"\udc00")
-    # for the following test, go to lengths to avoid CPython's optimizer
-    # and .pyc file storage, which collapse the two surrogates into one
-    c = u"\udc00"
-    py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
-
-
 def test_encode_utf_8_combine_surrogates():
     """
     In the case of a surrogate pair, the error handler should
@@ -52,80 +33,20 @@
            that is a valid surrogate pair.
         """
         assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
-        return [], None, end
+        return '', 0, end
 
-    unicode_encode_utf_8(
-        u, len(u), True,
+    utf8_encode_utf_8(
+        u, 'strict',
         errorhandler=errorhandler,
         allow_surrogates=False
     )
 
-def test_encode_utf8_allow_surrogates():
-    sp = FakeSpace()
-    assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
-    assert encode_utf8(sp, u"\udc00", allow_surrogates=True) == "\xed\xb0\x80"
-    c = u"\udc00"
-    got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True)
-    assert got == "\xf0\x90\x80\x80"
-
-def test_encode_utf8sp():
-    sp = FakeSpace()
-    assert encode_utf8sp(sp, u"\ud800") == "\xed\xa0\x80"
-    assert encode_utf8sp(sp, u"\udc00") == "\xed\xb0\x80"
-    c = u"\udc00"
-    got = encode_utf8sp(sp, u"\ud800" + c)
-    assert got == "\xed\xa0\x80\xed\xb0\x80"
-
-def test_decode_utf8():
-    space = FakeSpace()
-    assert decode_utf8(space, "abc") == u"abc"
-    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
-    py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80")
-    py.test.raises(Hit, decode_utf8, space, "\xed\xb0\x80")
-    py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80\xed\xb0\x80")
-    got = decode_utf8(space, "\xf0\x90\x80\x80")
-    if sys.maxunicode > 65535:
-        assert map(ord, got) == [0x10000]
-    else:
-        assert map(ord, got) == [55296, 56320]
-
-def test_decode_utf8_allow_surrogates():
-    sp = FakeSpace()
-    assert decode_utf8(sp, "\xed\xa0\x80", allow_surrogates=True) == u"\ud800"
-    assert decode_utf8(sp, "\xed\xb0\x80", allow_surrogates=True) == u"\udc00"
-    got = decode_utf8(sp, "\xed\xa0\x80\xed\xb0\x80", allow_surrogates=True)
-    assert map(ord, got) == [0xd800, 0xdc00]
-    got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True)
-    assert map(ord, got) == [0x10000]
-
 def test_decode_utf8sp():
     space = FakeSpace()
-    assert decode_utf8sp(space, "\xed\xa0\x80") == u"\ud800"
-    assert decode_utf8sp(space, "\xed\xb0\x80") == u"\udc00"
+    assert decode_utf8sp(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1, 3)
+    assert decode_utf8sp(space, "\xed\xb0\x80") == ("\xed\xb0\x80", 1, 3)
     got = decode_utf8sp(space, "\xed\xa0\x80\xed\xb0\x80")
-    assert map(ord, got) == [0xd800, 0xdc00]
+    assert map(ord, got[0].decode('utf8')) == [0xd800, 0xdc00]
     got = decode_utf8sp(space, "\xf0\x90\x80\x80")
-    assert map(ord, got) == [0x10000]
+    assert map(ord, got[0].decode('utf8')) == [0x10000]
 
-@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"])
-def test_utf32_surrogates(unich):
-    assert (unicode_encode_utf_32_be(unich, 1, None) ==
-            struct.pack('>i', ord(unich)))
-    with pytest.raises(UnicodeEncodeError):
-        unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
-
-    def replace_with(ru, rs):
-        def errorhandler(errors, enc, msg, u, startingpos, endingpos):
-            if errors == 'strict':
-                raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
-            return ru, rs, endingpos
-        return unicode_encode_utf_32_be(
-            u"<%s>" % unich, 3, None,
-            errorhandler, allow_surrogates=False)
-
-    assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
-    assert (replace_with(None, '\xca\xfe\xca\xfe') ==
-            '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
-
-    with pytest.raises(UnicodeDecodeError):
-        str_decode_utf_32_be(b"\x00\x00\xdc\x80", 4, None)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -218,20 +218,38 @@
     return res.build(), len(s), len(s)
 
 def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
-    try:
-        lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates)
-    except rutf8.CheckError as e:
-        # XXX change this to non-recursive
-        pos = e.pos
-        assert pos >= 0
-        start = s[:pos]
-        upos = rutf8.codepoints_in_utf8(s, end=pos)
-        ru, lgt, rettype = errorhandler(errors, 'utf8',
-                    'surrogates not allowed', s, upos, upos + 1)
-        end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
-                                allow_surrogates=allow_surrogates)
-        s = start + ru + end
-    return s
+    size = len(s)
+    if size == 0:
+        return ''
+    pos = 0
+    upos = 0
+    result = StringBuilder(size)
+    while pos < size:
+        try:
+            lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates, start=pos)
+            if pos == 0:
+                # fast path
+                return s
+            for ch in s[pos:]:
+                result.append(ch)
+            break
+        except rutf8.CheckError as e:
+            for ch in s[pos:e.pos]:
+                result.append(ch)
+            upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos)
+            pos = e.pos
+            assert pos >= 0
+            res, newindex, rettype = errorhandler(errors, 'utf8',
+                        'surrogates not allowed', s, upos, upos + 1)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(res):
+                    result.append(chr(cp))
+            else:
+                for ch in res:
+                    result.append(ch)
+            upos = newindex
+            pos = rutf8._pos_at_index(s, upos)
+    return result.build()
 
 def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False):
     try:
@@ -1013,49 +1031,6 @@
 
     return result.build()
 
-@specialize.memo()
-def _encode_unicode_error_handler(space):
-    # Fast version of the "strict" errors handler.
-    # used only in (unused) encode_utf8
-    from rpython.rlib import runicode
-    def raise_unicode_exception_encode(errors, encoding, msg, uni,
-                                       startingpos, endingpos):
-        assert isinstance(uni, unicode)
-        u_len = len(uni)
-        utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
-        raise OperationError(space.w_UnicodeEncodeError,
-                             space.newtuple([space.newtext(encoding),
-                                             space.newtext(utf8, u_len),
-                                             space.newint(startingpos),
-                                             space.newint(endingpos),
-                                             space.newtext(msg)]))
-        return u'', None, 0
-    return raise_unicode_exception_encode
-
-
-def encode_utf8(space, uni, allow_surrogates=False):
-    # Note that Python3 tends to forbid *all* surrogates in utf-8.
-    # If allow_surrogates=True, then revert to the Python 2 behavior
-    # which never raises UnicodeEncodeError.  Surrogate pairs are then
-    # allowed, either paired or lone.  A paired surrogate is considered
-    # like the non-BMP character it stands for.  See also *_utf8sp().
-    xxx
-    from rpython.rlib import runicode
-    assert isinstance(uni, unicode)
-    return runicode.unicode_encode_utf_8(
-        uni, len(uni), "strict",
-        errorhandler=_encode_unicode_error_handler(space),
-        allow_surrogates=allow_surrogates)
-
-def encode_utf8sp(space, uni, allow_surrogates=True):
-    xxx
-    # Surrogate-preserving utf-8 encoding.  Any surrogate character
-    # turns into its 3-bytes encoding, whether it is paired or not.
-    # This should always be reversible, and the reverse is
-    # decode_utf8sp().
-    from rpython.rlib import runicode
-    return runicode.unicode_encode_utf8sp(uni, len(uni))
-
 def decode_utf8sp(space, string):
     # Surrogate-preserving utf-8 decoding.  Assuming there is no
     # encoding error, it should always be reversible, and the reverse is
@@ -1063,7 +1038,6 @@
     return str_decode_utf8(string, "string", True, decode_never_raise,
                            allow_surrogates=True)
 
-
 # ____________________________________________________________
 # utf-16
 
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -1149,7 +1149,6 @@
                 backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate)
                 assert test_sequence.decode(encoding, "backslashreplace") == (before +
                                                              backslashreplace + after)
-                
 
     def test_lone_surrogates_utf_8(self):
         """
@@ -1158,6 +1157,8 @@
         """
         e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8",
                    "surrogateescape").value
+        assert e.start == 1
+        assert e.end == 3
         assert e.object[e.start:e.end] == u'\ud800\udfff'
 
     def test_charmap_encode(self):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to