Author: Tyler Wade <way...@gmail.com> Branch: utf8-unicode2 Changeset: r72468:e70f582fd5dc Date: 2014-07-17 01:43 -0500 http://bitbucket.org/pypy/pypy/changeset/e70f582fd5dc/
Log: Fix _multibytecodec diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py --- a/pypy/interpreter/utf8.py +++ b/pypy/interpreter/utf8.py @@ -2,9 +2,8 @@ from rpython.rlib.objectmodel import specialize from rpython.rlib.runicode import utf8_code_length from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb -from rpython.rlib.rarithmetic import r_uint -from rpython.rtyper.lltypesystem import rffi -from rpython.rtyper.lltypesystem import lltype +from rpython.rlib.rarithmetic import r_uint, intmask +from rpython.rtyper.lltypesystem import rffi, lltype wchar_rint = rffi.r_uint WCHAR_INTP = rffi.UINTP @@ -464,7 +463,7 @@ if rffi.sizeof(rffi.WCHAR_T) == 2: if 0xD800 <= c <= 0xDBFF: i += 1 - c2 = int(array[i]) + c2 = intmask(array[i]) if c2 == 0: builder.append(c) break @@ -485,7 +484,7 @@ builder = Utf8Builder() i = 0; while i < size: - c = int(array[i]) + c = intmask(array[i]) if c == 0: break @@ -513,7 +512,7 @@ builder = Utf8Builder() i = 0; while i < size: - c = int(array[i]) + c = intmask(array[i]) if rffi.sizeof(rffi.WCHAR_T) == 2: if i != size - 1 and 0xD800 <= c <= 0xDBFF: diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -1,8 +1,9 @@ import py from rpython.rtyper.lltypesystem import lltype, rffi from rpython.translator.tool.cbuild import ExternalCompilationInfo +from pypy.interpreter.utf8 import Utf8Str -UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD' +UNICODE_REPLACEMENT_CHARACTER = Utf8Str.from_unicode(u'\uFFFD') class EncodeDecodeError(Exception): @@ -139,7 +140,7 @@ errorcb, namecb, stringdata) src = pypy_cjk_dec_outbuf(decodebuf) length = pypy_cjk_dec_outlen(decodebuf) - return rffi.wcharpsize2unicode(src, length) + return Utf8Str.from_wcharpsize(src, length) # finally: rffi.free_nonmovingbuffer(stringdata, inbuf) @@ -164,18 +165,18 @@ if errors == "strict": raise EncodeDecodeError(start, end, reason) elif errors == "ignore": - replace = u"" + replace = Utf8Str("") elif errors == "replace": replace = UNICODE_REPLACEMENT_CHARACTER else: assert errorcb replace, end = errorcb(errors, namecb, reason, stringdata, start, end) - inbuf = rffi.get_nonmoving_unicodebuffer(replace) + inbuf = replace.copy_to_wcharp() try: r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end) finally: - rffi.free_nonmoving_unicodebuffer(replace, inbuf) + rffi.free_wcharp(inbuf) if r == MBERR_NOMEMORY: raise MemoryError @@ -222,7 +223,7 @@ def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None, namecb=None, ignore_error=0): inleft = len(unicodedata) - inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata) + inbuf = unicodedata.copy_to_wcharp() try: if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0: raise MemoryError @@ -247,7 +248,7 @@ return rffi.charpsize2str(src, length) # finally: - rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf) + rffi.free_wcharp(inbuf) def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb, unicodedata): @@ -273,7 +274,7 @@ elif errors == "replace": codec = pypy_cjk_enc_getcodec(encodebuf) try: - replace = encode(codec, u"?") + replace = encode(codec, Utf8Str("?")) except EncodeDecodeError: replace = "?" else: diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -1,4 +1,5 @@ import py +from pypy.interpreter.utf8 import Utf8Str from pypy.module._multibytecodec.c_codecs import getcodec, codecs from pypy.module._multibytecodec.c_codecs import decode, encode from pypy.module._multibytecodec.c_codecs import EncodeDecodeError @@ -95,37 +96,38 @@ def test_encode_hz(): c = getcodec("hz") - s = encode(c, u'foobar') + s = encode(c, Utf8Str('foobar')) assert s == 'foobar' and type(s) is str - s = encode(c, u'\u5f95\u6cef') + s = encode(c, Utf8Str.from_unicode(u'\u5f95\u6cef')) assert s == '~{abc}~}' def test_encode_hz_error(): # error c = getcodec("hz") - e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value + e = py.test.raises(EncodeDecodeError, encode, c, + Utf8Str.from_unicode(u'abc\u1234def')).value assert e.start == 3 assert e.end == 4 assert e.reason == "illegal multibyte sequence" def test_encode_hz_ignore(): c = getcodec("hz") - s = encode(c, u'abc\u1234def', 'ignore') + s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'ignore') assert s == 'abcdef' def test_encode_hz_replace(): c = getcodec("hz") - s = encode(c, u'abc\u1234def', 'replace') + s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'replace') assert s == 'abc?def' def test_encode_jisx0208(): c = getcodec('iso2022_jp') - s = encode(c, u'\u83ca\u5730\u6642\u592b') + s = encode(c, Utf8Str.from_unicode(u'\u83ca\u5730\u6642\u592b')) assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str def test_encode_custom_error_handler_bytes(): c = getcodec("hz") def errorhandler(errors, enc, msg, t, startingpos, endingpos): return None, '\xc3', endingpos - s = encode(c, u'abc\u1234def', 'foo', errorhandler) + s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'foo', errorhandler) assert '\xc3' in s diff --git a/pypy/module/sys/vm.py b/pypy/module/sys/vm.py --- a/pypy/module/sys/vm.py +++ b/pypy/module/sys/vm.py @@ -3,11 +3,11 @@ """ from rpython.rlib import jit -from rpython.rlib.runicode import MAXUNICODE from pypy.interpreter import gateway from pypy.interpreter.error import OperationError from pypy.interpreter.gateway import unwrap_spec +from pypy.interpreter.utf8_codecs import MAXUNICODE # ____________________________________________________________ _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit