Author: Tyler Wade <way...@gmail.com> Branch: utf8-unicode2 Changeset: r72447:cc1160f9014e Date: 2014-07-13 07:34 -0500 http://bitbucket.org/pypy/pypy/changeset/cc1160f9014e/
Log: Fix _rawffi module diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py --- a/pypy/interpreter/test/test_utf8.py +++ b/pypy/interpreter/test/test_utf8.py @@ -51,7 +51,7 @@ if sys.maxunicode < 65536: assert l[:3] == [u'A', u'\u010F', u'\u20AC'] else: - assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D'] + assert l == [u'A', u'\u010F', u'\u20AC', u'\U0001F63D'] def test_reverse_iterator(): s = build_utf8str() @@ -197,7 +197,7 @@ def test_copy_to_wcharp(): s = build_utf8str() - if sys.maxunicode < 0x10000: + if sys.maxunicode < 0x10000 and rffi.sizeof(rffi.WCHAR_T) == 4: # The last character requires a surrogate pair on narrow builds and # so won't be converted correctly by rffi.wcharp2unicode s = s[:-1] @@ -206,3 +206,27 @@ u = rffi.wcharp2unicode(wcharp) rffi.free_wcharp(wcharp) assert s == u + +def test_from_wcharp(): + def check(u): + wcharp = rffi.unicode2wcharp(u) + s = Utf8Str.from_wcharp(wcharp) + rffi.free_wcharp(wcharp) + assert s == u + check(u'A\u010F\u20AC\U0001F63D') + check(u'0xDCC0 ') + check(u'0xDCC0') + +def test_from_wcharpn(): + u = u'A\u010F\u20AC\U0001F63D' + wcharp = rffi.unicode2wcharp(u) + s = Utf8Str.from_wcharpn(wcharp, 3) + assert s == u[:3] + + s = Utf8Str.from_wcharpn(wcharp, 4) + if sys.maxunicode == 0xFFFF: + assert s == u[:4] + else: + assert s == u + + rffi.free_wcharp(wcharp) diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py --- a/pypy/interpreter/utf8.py +++ b/pypy/interpreter/utf8.py @@ -4,6 +4,14 @@ from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb from rpython.rlib.rarithmetic import r_uint from rpython.rtyper.lltypesystem import rffi +from rpython.rtyper.lltypesystem import lltype + +wchar_rint = rffi.r_uint +WCHAR_INTP = rffi.UINTP +if rffi.sizeof(rffi.WCHAR_T) == 2: + wchar_rint = rffi.r_ushort + WCHAR_INTP = rffi.USHORTP + def utf8chr(value): # Like unichr, but returns a Utf8Str object @@ -415,15 +423,89 @@ byte_pos -= 1 return byte_pos - def copy_to_wcharp(self): - # XXX Temporary solution. This won't work on correctly on systems - # where sizeof(wchar_t) == 2. Also, it copies twice. - from pypy.interpreter.utf8_codecs import unicode_encode_unicode_internal - from rpython.rlib.runicode import MAXUNICODE - bytes = unicode_encode_unicode_internal(self, len(self), 'strict') - return rffi.cast(rffi.CWCHARP, rffi.str2charp(bytes)) + def copy_to_wcharp(self, track_allocation=True): + length = len(self) + 1 + if rffi.sizeof(rffi.WCHAR_T) == 2: + for c in self.codepoint_iter(): + if c > 0xFFFF: + length += 1 + array = lltype.malloc(WCHAR_INTP.TO, length, flavor='raw', + track_allocation=track_allocation) + from pypy.interpreter.utf8_codecs import create_surrogate_pair + i = 0; + for c in self.codepoint_iter(): + if rffi.sizeof(rffi.WCHAR_T) == 2: + c1, c2 = create_surrogate_pair(c) + array[i] = wchar_rint(c1) + if c2: + i += 1 + array[i] = wchar_rint(c2) + else: + array[i] = wchar_rint(c) + + i += 1 + + array[i] = wchar_rint(0) + array = rffi.cast(rffi.CWCHARP, array) + return array + + @staticmethod + def from_wcharp(wcharp): + array = rffi.cast(WCHAR_INTP, wcharp) + builder = Utf8Builder() + i = 0; + while True: + c = int(array[i]) + if c == 0: + break + + if rffi.sizeof(rffi.WCHAR_T) == 2: + if 0xD800 <= c <= 0xDBFF: + i += 1 + c2 = int(array[i]) + if c2 == 0: + builder.append(c) + break + elif not (0xDC00 <= c2 <= 0xDFFF): + builder.append(c) + c = c2 + else: + c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000; + + builder.append(c) + i += 1 + + return builder.build() + + @staticmethod + def from_wcharpn(wcharp, size): + array = rffi.cast(WCHAR_INTP, wcharp) + builder = Utf8Builder() + i = 0; + while i < size: + c = int(array[i]) + if c == 0: + break + + if rffi.sizeof(rffi.WCHAR_T) == 2: + if i != size - 1 and 0xD800 <= c <= 0xDBFF: + i += 1 + c2 = int(array[i]) + if c2 == 0: + builder.append(c) + break + elif not (0xDC00 <= c2 <= 0xDFFF): + builder.append(c) + c = c2 + else: + c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000; + + builder.append(c) + i += 1 + + return builder.build() class Utf8Builder(object): @specialize.argtype(1) diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py --- a/pypy/interpreter/utf8_codecs.py +++ b/pypy/interpreter/utf8_codecs.py @@ -784,6 +784,13 @@ result.append(r) return result.build(), pos, bo +def create_surrogate_pair(val): + if val >= 0x10000: + return (0xD800 | ((val-0x10000) >> 10), + 0xDC00 | ((val-0x10000) & 0x3FF)) + else: + return val, 0 + def unicode_encode_utf_16_helper(s, size, errors, errorhandler=None, byteorder='little'): @@ -803,10 +810,7 @@ while i < size: ch = utf8ord(s, i) i += 1 - ch2 = 0 - if ch >= 0x10000: - ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF) - ch = 0xD800 | ((ch-0x10000) >> 10) + ch, ch2 = create_surrogate_pair(ch) _STORECHAR(result, ch, byteorder) if ch2: diff --git a/pypy/module/_rawffi/alt/interp_funcptr.py b/pypy/module/_rawffi/alt/interp_funcptr.py --- a/pypy/module/_rawffi/alt/interp_funcptr.py +++ b/pypy/module/_rawffi/alt/interp_funcptr.py @@ -168,7 +168,7 @@ self.argchain.arg(addr) def handle_unichar_p(self, w_ffitype, w_obj, unicodeval): - buf = rffi.unicode2wcharp(unicodeval) + buf = unicodeval.copy_to_wcharp() self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf)) addr = rffi.cast(rffi.ULONG, buf) self.argchain.arg(addr) diff --git a/pypy/module/_rawffi/alt/test/test_type_converter.py b/pypy/module/_rawffi/alt/test/test_type_converter.py --- a/pypy/module/_rawffi/alt/test/test_type_converter.py +++ b/pypy/module/_rawffi/alt/test/test_type_converter.py @@ -1,6 +1,7 @@ import sys from rpython.rlib.rarithmetic import r_uint, r_singlefloat, r_longlong, r_ulonglong from rpython.rlib.libffi import IS_32_BIT +from pypy.interpreter.utf8 import Utf8Str from pypy.module._rawffi.alt.interp_ffitype import app_types, descr_new_pointer from pypy.module._rawffi.alt.type_converter import FromAppLevelConverter, ToAppLevelConverter @@ -58,7 +59,8 @@ def test_char(self): space = self.space self.check(app_types.char, space.wrap('a'), ord('a')) - self.check(app_types.unichar, space.wrap(u'\u1234'), 0x1234) + self.check(app_types.unichar, + space.wrap(Utf8Str.from_unicode(u'\u1234')), 0x1234) def test_signed_longlong(self): space = self.space @@ -120,8 +122,11 @@ def test_strings(self): # first, try automatic conversion from applevel self.check(app_types.char_p, self.space.wrap('foo'), 'foo') - self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234') - self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo') + self.check(app_types.unichar_p, + self.space.wrap(Utf8Str.from_unicode(u'foo\u1234')), + Utf8Str.from_unicode(u'foo\u1234')) + self.check(app_types.unichar_p, self.space.wrap('foo'), + Utf8Str.from_unicode(u'foo')) # then, try to pass explicit pointers self.check(app_types.char_p, self.space.wrap(42), 42) self.check(app_types.unichar_p, self.space.wrap(42), 42) diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -2,6 +2,7 @@ from rpython.rlib import jit from rpython.rlib.rarithmetic import r_uint from pypy.interpreter.error import OperationError, oefmt +from pypy.interpreter.utf8 import utf8chr from pypy.module._rawffi.structure import W_StructureInstance, W_Structure from pypy.module._rawffi.alt.interp_ffitype import app_types @@ -228,7 +229,7 @@ return space.wrap(chr(ucharval)) elif w_ffitype.is_unichar(): wcharval = self.get_unichar(w_ffitype) - return space.wrap(unichr(wcharval)) + return space.wrap(utf8chr(int(wcharval))) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): diff --git a/pypy/module/_rawffi/array.py b/pypy/module/_rawffi/array.py --- a/pypy/module/_rawffi/array.py +++ b/pypy/module/_rawffi/array.py @@ -42,14 +42,27 @@ if not space.is_none(w_items): items_w = space.unpackiterable(w_items) iterlength = len(items_w) - if iterlength > length: + + double_length_items = 0 + if rffi.sizeof(rffi.WCHAR_T) == 2: + # On systems where sizeof(wchar_t) = 2, the resulting array + # needs to be encoded in utf-16. As a result, codepoints larger + # than 0xFFFF will occupy two array values + for w_i in items_w: + if space.isinstance_w(w_i, space.w_unicode): + u = space.unicode_w(w_i) + if len(u) == 0 and utf8ord(u) > 0xFFFF: + double_length_items += 1 + + if iterlength + double_length_items > length: raise OperationError(space.w_ValueError, space.wrap("too many items for specified" " array length")) - for num in range(iterlength): - w_item = items_w[num] - unwrap_value(space, write_ptr, result.ll_buffer, num, - self.itemcode, w_item) + i = 0 + for w_item in items_w: + i += unwrap_value(space, write_ptr, result.ll_buffer, i, + self.itemcode, w_item) + return space.wrap(result) def descr_repr(self, space): diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -2,6 +2,9 @@ from pypy.interpreter.error import OperationError, oefmt, wrap_oserror from pypy.interpreter.gateway import interp2app, unwrap_spec from pypy.interpreter.typedef import TypeDef, GetSetProperty +from pypy.interpreter.utf8 import ( + Utf8Str, utf8ord, utf8chr, WCHAR_INTP, wchar_rint) +from pypy.interpreter.utf8_codecs import create_surrogate_pair from rpython.rlib.clibffi import * from rpython.rtyper.lltypesystem import lltype, rffi @@ -85,6 +88,7 @@ LL_TYPEMAP['X'] = rffi.CCHARP LL_TYPEMAP['v'] = rffi.SHORT + def letter2tp(space, key): from pypy.module._rawffi.array import PRIMITIVE_ARRAY_TYPES try: @@ -269,6 +273,8 @@ ptr_val = t_array[0] return ptr_val else: + if T is rffi.CWCHARP: + return utf8chr(int(rffi.cast(WCHAR_INTP, ptr)[ofs])) return rffi.cast(T, ptr)[ofs] read_ptr._annspecialcase_ = 'specialize:arg(2)' @@ -382,14 +388,18 @@ else: ptr = unwrap_truncate_int(rffi.VOIDP, space, w_arg) push_func(add_arg, argdesc, ptr) + return 1 elif letter == "d": push_func(add_arg, argdesc, space.float_w(w_arg)) + return 1 elif letter == "f": push_func(add_arg, argdesc, rffi.cast(rffi.FLOAT, space.float_w(w_arg))) + return 1 elif letter == "g": push_func(add_arg, argdesc, rffi.cast(rffi.LONGDOUBLE, space.float_w(w_arg))) + return 1 elif letter == "c": s = space.str_w(w_arg) if len(s) != 1: @@ -397,20 +407,31 @@ "Expected string of length one as character")) val = s[0] push_func(add_arg, argdesc, val) + return 1 elif letter == 'u': s = space.unicode_w(w_arg) if len(s) != 1: raise OperationError(space.w_TypeError, w( "Expected unicode string of length one as wide character")) - val = s[0] - push_func(add_arg, argdesc, val) + + val = utf8ord(s) + if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF: + # Utf-16 must be used on systems with a 2 byte wchar_t to + # encode codepoints > 0xFFFF + c1, c2 = create_surrogate_pair(val) + push_func(add_arg, argdesc, wchar_rint(c1)) + push_func(add_arg, argdesc+1, wchar_rint(c2)) + return 2 + else: + push_func(add_arg, argdesc, wchar_rint(val)) + return 1 else: for c in unroll_letters_for_numbers: if letter == c: TP = LL_TYPEMAP[c] val = unwrap_truncate_int(TP, space, w_arg) push_func(add_arg, argdesc, val) - return + return 1 else: raise OperationError(space.w_TypeError, space.wrap("cannot directly write value")) @@ -559,9 +580,9 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2unicode(wcharp_addr) + s = Utf8Str.from_wcharp(wcharp_addr) else: - s = rffi.wcharp2unicoden(wcharp_addr, maxlength) + s = Utf8Str.from_wcharpn(wcharp_addr, maxlength) return space.wrap(s) @unwrap_spec(address=r_uint, maxlength=int) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit