[pypy-commit] pypy unicode-utf8: fix multibytecodec

fijal Thu, 23 Nov 2017 09:31:33 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93146:99ca8cf9bbc4
Date: 2017-11-23 18:30 +0100
http://bitbucket.org/pypy/pypy/changeset/99ca8cf9bbc4/


Log:    fix multibytecodec

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -197,19 +197,21 @@
 MBENC_FLUSH = 1
 MBENC_RESET = 2
 
-def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+def encode(codec, unicodedata, length, errors="strict", errorcb=None,
+           namecb=None):
     encodebuf = pypy_cjk_enc_new(codec)
     if not encodebuf:
         raise MemoryError
     try:
-        return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+        return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
     finally:
         pypy_cjk_enc_free(encodebuf)
 
-def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
              namecb=None, ignore_error=0):
-    inleft = len(unicodedata)
-    with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
+    inleft = length
+    inbuf = rffi.utf82wcharp(utf8data, length)
+    try:
         if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
             raise MemoryError
         if ignore_error == 0:
@@ -221,16 +223,18 @@
             if r == 0 or r == ignore_error:
                 break
             multibytecodec_encerror(encodebuf, r, errors,
-                                    errorcb, namecb, unicodedata)
+                                    errorcb, namecb, utf8data)
         while flags & MBENC_RESET:
             r = pypy_cjk_enc_reset(encodebuf)
             if r == 0:
                 break
             multibytecodec_encerror(encodebuf, r, errors,
-                                    errorcb, namecb, unicodedata)
+                                    errorcb, namecb, utf8data)
         src = pypy_cjk_enc_outbuf(encodebuf)
         length = pypy_cjk_enc_outlen(encodebuf)
         return rffi.charpsize2str(src, length)
+    finally:
+        lltype.free(inbuf, flavor='raw')
 
 def multibytecodec_encerror(encodebuf, e, errors,
                             errorcb, namecb, unicodedata):
@@ -256,21 +260,16 @@
     elif errors == "replace":
         codec = pypy_cjk_enc_getcodec(encodebuf)
         try:
-            replace = encode(codec, u"?")
+            replace = encode(codec, "?", 1)
         except EncodeDecodeError:
             replace = "?"
     else:
         assert errorcb
-        XXX
-        retu, rets, end = errorcb(errors, namecb, reason,
-                                  unicodedata.encode("utf8"), start, end)
-        if rets is not None:
-            # py3k only
-            replace = rets
-        else:
-            assert retu is not None
-            codec = pypy_cjk_enc_getcodec(encodebuf)
-            replace = encode(codec, retu, "strict", errorcb, namecb)
+        rets, end = errorcb(errors, namecb, reason,
+                            unicodedata, start, end)
+        codec = pypy_cjk_enc_getcodec(encodebuf)
+        lgt, _ = rutf8.get_utf8_length_flag(rets)
+        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
     with rffi.scoped_nonmovingbuffer(replace) as inbuf:
         r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
     if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,4 +1,5 @@
 from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib import rutf8
 from pypy.module._multibytecodec import c_codecs
 from pypy.module._multibytecodec.interp_multibytecodec import (
     MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
@@ -65,7 +66,8 @@
         pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
         assert 0 <= pos <= len(object)
         self.pending = object[pos:]
-        return space.newunicode(output)
+        lgt, flag = rutf8.get_utf8_length_flag(output)
+        return space.newutf8(output, lgt, flag)
 
 
 @unwrap_spec(errors="text_or_none")
@@ -88,7 +90,8 @@
 
     def _initialize(self):
         self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
-        self.pending = u""
+        self.pending = ""
+        self.pending_len = 0
 
     def _free(self):
         self.pending = None
@@ -96,25 +99,37 @@
             c_codecs.pypy_cjk_enc_free(self.encodebuf)
             self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
 
-    @unwrap_spec(object='utf8', final=bool)
-    def encode_w(self, object, final=False):
-        u_object = object.decode('utf8')
+    @unwrap_spec(final=bool)
+    def encode_w(self, space, w_object, final=False):
+        utf8data, length = space.utf8_len_w(w_object)
         space = self.space
         state = space.fromcache(CodecState)
         if len(self.pending) > 0:
-            u_object = self.pending + u_object
+            utf8data = self.pending + utf8data
+            length += self.pending_len
         try:
-            output = c_codecs.encodeex(self.encodebuf, u_object, self.errors,
+            output = c_codecs.encodeex(self.encodebuf, utf8data, length,
+                                       self.errors,
                                        state.encode_error_handler, self.name,
                                        get_ignore_error(final))
         except c_codecs.EncodeDecodeError as e:
-            raise wrap_unicodeencodeerror(space, e, object, len(u_object),
+            raise wrap_unicodeencodeerror(space, e, utf8data, length,
                                           self.name)
         except RuntimeError:
             raise wrap_runtimeerror(space)
         pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
-        assert 0 <= pos <= len(u_object)
-        self.pending = u_object[pos:]
+        assert 0 <= pos <= length
+        # scan the utf8 string until we hit pos
+        i = 0
+        stop = length - pos
+        self.pending_len = stop
+        if stop > 0:
+            while pos > 0:
+                i = rutf8.next_codepoint_pos(utf8data, i)
+                pos -= 1
+            self.pending = utf8data[i:]
+        else:
+            self.pending = ""
         return space.newbytes(output)
 
 
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -31,23 +31,23 @@
         return space.newtuple([space.newutf8(utf8_output, lgt, flag),
                                space.newint(len(input))])
 
-    @unwrap_spec(input='utf8', errors="text_or_none")
-    def encode(self, space, input, errors=None):
+    @unwrap_spec(errors="text_or_none")
+    def encode(self, space, w_input, errors=None):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
+        input, length = space.utf8_len_w(w_input)
         #
-        u_input = input.decode('utf8')
         try:
-            output = c_codecs.encode(self.codec, u_input, errors,
+            output = c_codecs.encode(self.codec, input, length, errors,
                                      state.encode_error_handler, self.name)
         except c_codecs.EncodeDecodeError as e:
-            raise wrap_unicodeencodeerror(space, e, input, len(u_input),
+            raise wrap_unicodeencodeerror(space, e, input, length,
                                           self.name)
         except RuntimeError:
             raise wrap_runtimeerror(space)
         return space.newtuple([space.newbytes(output),
-                               space.newint(len(u_input))])
+                               space.newint(length)])
 
 
 MultibyteCodec.typedef = TypeDef(
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -14,27 +14,27 @@
 def test_decode_gbk():
     c = getcodec("gbk")
     u = decode(c, "\xA1\xAA")
-    assert u == unichr(0x2014)
+    assert u == unichr(0x2014).encode('utf8')
     u = decode(c, "foobar")
-    assert u == u"foobar"
+    assert u == "foobar"
 
 def test_decode_hz():
     # stateful
     c = getcodec("hz")
     u = decode(c, "~{abc}")
-    assert u == u'\u5f95\u6cef'
+    assert u == u'\u5f95\u6cef'.encode('utf8')
     u = decode(c, "~{")
-    assert u == u''
+    assert u == ''
 
 def test_decodeex_hz():
     c = getcodec("hz")
     decodebuf = c_codecs.pypy_cjk_dec_new(c)
     u = c_codecs.decodeex(decodebuf, "~{abcd~}")
-    assert u == u'\u5f95\u6c85'
+    assert u == u'\u5f95\u6c85'.encode('utf8')
     u = c_codecs.decodeex(decodebuf, "~{efgh~}")
-    assert u == u'\u5f50\u73b7'
+    assert u == u'\u5f50\u73b7'.encode('utf8')
     u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
-    assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+    assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'.encode('utf8')
     c_codecs.pypy_cjk_dec_free(decodebuf)
 
 def test_decodeex_hz_incomplete():
@@ -64,7 +64,7 @@
         buf += c
         u = c_codecs.decodeex(decodebuf, buf,
                               ignore_error = c_codecs.MBERR_TOOFEW)
-        assert u == output
+        assert u == output.encode('utf8')
         incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
         buf = buf[incompletepos:]
     assert buf == ''
@@ -86,46 +86,47 @@
 def test_decode_hz_ignore():
     c = getcodec("hz")
     u = decode(c, 'def~{}abc', 'ignore')
-    assert u == u'def\u5fcf'
+    assert u == u'def\u5fcf'.encode('utf8')
 
 def test_decode_hz_replace():
     c = getcodec("hz")
     u = decode(c, 'def~{}abc', 'replace')
-    assert u == u'def\ufffd\u5fcf'
+    assert u == u'def\ufffd\u5fcf'.encode('utf8')
 
 def test_encode_hz():
     c = getcodec("hz")
-    s = encode(c, u'foobar')
+    s = encode(c, u'foobar'.encode('utf8'), 6)
     assert s == 'foobar' and type(s) is str
-    s = encode(c, u'\u5f95\u6cef')
+    s = encode(c, u'\u5f95\u6cef'.encode('utf8'), 2)
     assert s == '~{abc}~}'
 
 def test_encode_hz_error():
     # error
     c = getcodec("hz")
-    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def'.encode('utf8'), 7).value
     assert e.start == 3
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
 
 def test_encode_hz_ignore():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'ignore')
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'ignore')
     assert s == 'abcdef'
 
 def test_encode_hz_replace():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'replace')
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'replace')
     assert s == 'abc?def'
 
 def test_encode_jisx0208():
     c = getcodec('iso2022_jp')
-    s = encode(c, u'\u83ca\u5730\u6642\u592b')
+    s = encode(c, u'\u83ca\u5730\u6642\u592b'.encode('utf8'), 4)
     assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
 
 def test_encode_custom_error_handler_bytes():
+    py.test.skip("needs revamping in py3k")
     c = getcodec("hz")
     def errorhandler(errors, enc, msg, t, startingpos, endingpos):
-        return None, '\xc3', endingpos
-    s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+        return u'\xc3'.encode('utf8'), endingpos
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler)
     assert '\xc3' in s
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -1,6 +1,7 @@
 from pypy.module._multibytecodec import c_codecs
 from rpython.translator.c.test import test_standalone
 from rpython.config.translationoption import get_combined_translation_config
+from rpython.rlib import rutf8
 
 
 class TestTranslation(test_standalone.StandaloneTests):
@@ -13,7 +14,8 @@
             codecname, string = argv[1], argv[2]
             c = c_codecs.getcodec(codecname)
             u = c_codecs.decode(c, string)
-            r = c_codecs.encode(c, u)
+            lgt, _ = rutf8.get_utf8_length_flag(u)
+            r = c_codecs.encode(c, u, lgt)
             print r
             return 0
         #
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: fix multibytecodec

Reply via email to