Author: fijal
Branch: unicode-utf8
Changeset: r90456:777001a4a191
Date: 2017-03-01 16:52 +0100
http://bitbucket.org/pypy/pypy/changeset/777001a4a191/
Log: "fix" multibytecodec diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -3,7 +3,7 @@ from rpython.translator.tool.cbuild import ExternalCompilationInfo from rpython.translator import cdir -UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD' +UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'.encode("utf8") class EncodeDecodeError(Exception): @@ -148,15 +148,17 @@ if errors == "strict": raise EncodeDecodeError(start, end, reason) elif errors == "ignore": - replace = u"" + replace = "" + lgt = 0 elif errors == "replace": replace = UNICODE_REPLACEMENT_CHARACTER + lgt = 1 else: assert errorcb - replace, end = errorcb(errors, namecb, reason, + replace, end, lgt = errorcb(errors, namecb, reason, stringdata, start, end) - with rffi.scoped_nonmoving_unicodebuffer(replace) as inbuf: - r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end) + with rffi.scoped_nonmoving_unicodebuffer(replace.decode("utf8")) as inbuf: + r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end) if r == MBERR_NOMEMORY: raise MemoryError @@ -255,15 +257,15 @@ replace = "?" 
else: assert errorcb - retu, rets, end = errorcb(errors, namecb, reason, - unicodedata, start, end) + retu, rets, end, lgt = errorcb(errors, namecb, reason, + unicodedata.encode("utf8"), start, end) if rets is not None: # py3k only replace = rets else: assert retu is not None codec = pypy_cjk_enc_getcodec(encodebuf) - replace = encode(codec, retu, "strict", errorcb, namecb) + replace = encode(codec, retu.decode("utf8"), "strict", errorcb, namecb) with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) if r == MBERR_NOMEMORY: diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -96,8 +96,9 @@ c_codecs.pypy_cjk_enc_free(self.encodebuf) self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO) - @unwrap_spec(object=unicode, final=bool) - def encode_w(self, object, final=False): + @unwrap_spec(utf8object='utf8', final=bool) + def encode_w(self, utf8object, objlen, final=False): + object = utf8object.decode('utf8') space = self.space state = space.fromcache(CodecState) if len(self.pending) > 0: @@ -107,7 +108,7 @@ state.encode_error_handler, self.name, get_ignore_error(final)) except c_codecs.EncodeDecodeError as e: - raise wrap_unicodeencodeerror(space, e, object, self.name) + raise wrap_unicodeencodeerror(space, e, utf8object, self.name) except RuntimeError: raise wrap_runtimeerror(space) pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf) diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -18,30 +18,30 @@ state = space.fromcache(CodecState) # try: - output = c_codecs.decode(self.codec, input, errors, + u_output = 
c_codecs.decode(self.codec, input, errors, state.decode_error_handler, self.name) except c_codecs.EncodeDecodeError as e: raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) - return space.newtuple([space.newunicode(output), + return space.newtuple([space.newunicode(u_output), space.newint(len(input))]) - @unwrap_spec(input=unicode, errors="str_or_None") - def encode(self, space, input, errors=None): + @unwrap_spec(input='utf8', errors="str_or_None") + def encode(self, space, input, inputlen, errors=None): if errors is None: errors = 'strict' state = space.fromcache(CodecState) # try: - output = c_codecs.encode(self.codec, input, errors, + output = c_codecs.encode(self.codec, input.decode('utf8'), errors, state.encode_error_handler, self.name) except c_codecs.EncodeDecodeError as e: raise wrap_unicodeencodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) return space.newtuple([space.newbytes(output), - space.newint(len(input))]) + space.newint(inputlen)]) MultibyteCodec.typedef = TypeDef( @@ -76,7 +76,7 @@ space.w_UnicodeEncodeError, space.newtuple([ space.newtext(name), - space.newunicode(input), + space.newutf8(input, -1), space.newint(e.start), space.newint(e.end), space.newtext(e.reason)])) diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -126,6 +126,6 @@ def test_encode_custom_error_handler_bytes(): c = getcodec("hz") def errorhandler(errors, enc, msg, t, startingpos, endingpos): - return None, '\xc3', endingpos + return None, '\xc3', endingpos, -1 s = encode(c, u'abc\u1234def', 'foo', errorhandler) assert '\xc3' in s _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit