Author: Armin Rigo <[email protected]>
Branch: cffi-char16-char32
Changeset: r91503:d6d714960021
Date: 2017-06-04 07:01 +0200
http://bitbucket.org/pypy/pypy/changeset/d6d714960021/
Log: in-progress
diff --git a/pypy/module/_cffi_backend/cffi_opcode.py b/pypy/module/_cffi_backend/cffi_opcode.py
--- a/pypy/module/_cffi_backend/cffi_opcode.py
+++ b/pypy/module/_cffi_backend/cffi_opcode.py
@@ -107,8 +107,10 @@
PRIM_UINTMAX = 47
PRIM_FLOATCOMPLEX = 48
PRIM_DOUBLECOMPLEX = 49
+PRIM_CHAR16 = 50
+PRIM_CHAR32 = 51
-_NUM_PRIM = 50
+_NUM_PRIM = 52
_UNKNOWN_PRIM = -1
_UNKNOWN_FLOAT_PRIM = -2
_UNKNOWN_LONG_DOUBLE = -3
@@ -131,8 +133,12 @@
'float': PRIM_FLOAT,
'double': PRIM_DOUBLE,
'long double': PRIM_LONGDOUBLE,
+ 'float _Complex': PRIM_FLOATCOMPLEX,
+ 'double _Complex': PRIM_DOUBLECOMPLEX,
'_Bool': PRIM_BOOL,
'wchar_t': PRIM_WCHAR,
+ 'char16_t': PRIM_CHAR16,
+ 'char32_t': PRIM_CHAR32,
'int8_t': PRIM_INT8,
'uint8_t': PRIM_UINT8,
'int16_t': PRIM_INT16,
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -42,6 +42,7 @@
def cast_unicode(self, w_ob):
space = self.space
s = space.unicode_w(w_ob)
+ XXXXXXXXXXXXXX
if len(s) != 1:
raise oefmt(space.w_TypeError,
"cannot cast unicode string of length %d to ctype '%s'",
@@ -149,15 +150,15 @@
class W_CTypePrimitiveUniChar(W_CTypePrimitiveCharOrUniChar):
- _attrs_ = ['is_signed']
- _immutable_fields_ = ['is_signed']
+ _attrs_ = ['is_signed_wchar']
+ _immutable_fields_ = ['is_signed_wchar']
_wchar_is_signed = rfficache.signof_c_type('wchar_t')
def __init__(self, space, size, name, name_position, align):
- W_CTypePrimitiveUniChar.__init__(self, space, size, name,
- name_position, align)
- self.is_signed = self._wchar_is_signed and (name == "wchar_t")
+ W_CTypePrimitiveCharOrUniChar.__init__(self, space, size, name,
+ name_position, align)
+ self.is_signed_wchar = self._wchar_is_signed and (name == "wchar_t")
# "char16_t" and "char32_t" are always unsigned
def cast_to_int(self, cdata):
@@ -185,32 +186,41 @@
w_res = self.convert_to_object(ptr)
return w_res
- def _convert_to_charN_t(self, w_ob, size):
- # returns a r_uint. If size == 2, it is smaller than 0x10000
+ def _convert_to_charN_t(self, w_ob):
+ # returns a r_uint. If self.size == 2, it is smaller than 0x10000
space = self.space
if space.isinstance_w(w_ob, space.w_unicode):
u = space.unicode_w(w_ob)
- if len(u) == 1:
- u = ord(u[0])
- if size == 2 and u > 0xffff:
+ try:
+ ordinal = wchar_helper.unicode_to_ordinal(u)
+ except ValueError:
+ pass
+ else:
+ if self.size == 2 and ordinal > 0xffff:
raise self._convert_error("single character <= 0xFFFF",
w_ob)
- return r_uint(u)
- elif size == 4 and len(u) == 2 and ...
-
+ return ordinal
elif (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
- w_ob.ctype.size == 2):
+ w_ob.ctype.size == self.size):
with w_ob as ptr:
- return misc.read_raw_ulong_data(ptr, 2)
+ return misc.read_raw_ulong_data(ptr, self.size)
raise self._convert_error("unicode string of length 1", w_ob)
def convert_from_object(self, cdata, w_ob):
- ordinal = self._convert_to_char16(w_ob, self.size)
+ ordinal = self._convert_to_charN_t(w_ob)
misc.write_raw_unsigned_data(cdata, ordinal, self.size)
def unpack_ptr(self, w_ctypeptr, ptr, length):
- u = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+ if self.size == 2:
+ u = wchar_helper.unicode_from_char16(ptr, length)
+ else:
+ try:
+ u = wchar_helper.unicode_from_char32(ptr, length)
+ except OutOfRange as e:
+ raise oefmt(self.space.w_ValueError,
+ "char32_t out of range for "
+ "conversion to unicode: %s", hex(e.ordinal))
return self.space.newunicode(u)
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -91,11 +91,24 @@
if not space.isinstance_w(w_ob, space.w_unicode):
raise self._convert_error("unicode or list or tuple", w_ob)
s = space.unicode_w(w_ob)
+ XXXXXXXXXXXXXXX
n = len(s)
if self.length >= 0 and n > self.length:
raise oefmt(space.w_IndexError,
"initializer unicode string is too long for '%s' "
"(got %d characters)", self.name, n)
+
+
+
+
+ if self.ctitem.size == 2:
+ length = wchar_helper.measure_length_16(ptr, length)
+ else:
+ length = wchar_helper.measure_length_32(ptr, length)
+ XXXX
+
+
+
unichardata = rffi.cast(rffi.CWCHARP, cdata)
copy_unicode_to_raw(llunicode(s), unichardata, 0, n)
if n != self.length:
@@ -134,12 +147,12 @@
#
# pointer to a wchar_t: builds and returns a unicode
if self.is_unichar_ptr_or_array():
- cdata = rffi.cast(rffi.CWCHARP, ptr)
- if length < 0:
- u = rffi.wcharp2unicode(cdata)
+ from pypy.module._cffi_backend import wchar_helper
+ if self.ctitem.size == 2:
+ length = wchar_helper.measure_length_16(ptr, length)
else:
- u = rffi.wcharp2unicoden(cdata, length)
- return space.newunicode(u)
+ length = wchar_helper.measure_length_32(ptr, length)
+ return self.ctitem.unpack_ptr(self, ptr, length)
#
return W_CType.string(self, cdataobj, maxlen)
@@ -304,6 +317,7 @@
length = space.int_w(space.len(w_init))
elif space.isinstance_w(w_init, space.w_basestring):
# from a string, we add the null terminator
+ XXXXXXXXXXXXXXX
length = space.int_w(space.len(w_init)) + 1
elif self.is_file:
result = self.prepare_file(w_init)
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -296,6 +296,7 @@
return (w_value, space.int_w(space.len(w_value)))
elif space.isinstance_w(w_value, space.w_basestring):
# from a string, we add the null terminator
+ XXXXXXXXXX
return (w_value, space.int_w(space.len(w_value)) + 1)
else:
explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/realize_c_type.py b/pypy/module/_cffi_backend/realize_c_type.py
--- a/pypy/module/_cffi_backend/realize_c_type.py
+++ b/pypy/module/_cffi_backend/realize_c_type.py
@@ -73,6 +73,8 @@
"uintmax_t",
"float _Complex",
"double _Complex",
+ "char16_t",
+ "char32_t",
]
assert len(NAMES) == cffi_opcode._NUM_PRIM
diff --git a/pypy/module/_cffi_backend/src/parse_c_type.c b/pypy/module/_cffi_backend/src/parse_c_type.c
--- a/pypy/module/_cffi_backend/src/parse_c_type.c
+++ b/pypy/module/_cffi_backend/src/parse_c_type.c
@@ -505,6 +505,7 @@
case '1':
if (size == 8 && !memcmp(p, "uint16", 6)) return _CFFI_PRIM_UINT16;
+ if (size == 8 && !memcmp(p, "char16", 6)) return _CFFI_PRIM_CHAR16;
break;
case '2':
@@ -513,6 +514,7 @@
case '3':
if (size == 8 && !memcmp(p, "uint32", 6)) return _CFFI_PRIM_UINT32;
+ if (size == 8 && !memcmp(p, "char32", 6)) return _CFFI_PRIM_CHAR32;
break;
case '4':
diff --git a/pypy/module/_cffi_backend/src/parse_c_type.h b/pypy/module/_cffi_backend/src/parse_c_type.h
--- a/pypy/module/_cffi_backend/src/parse_c_type.h
+++ b/pypy/module/_cffi_backend/src/parse_c_type.h
@@ -80,8 +80,10 @@
#define _CFFI_PRIM_UINTMAX 47
#define _CFFI_PRIM_FLOATCOMPLEX 48
#define _CFFI_PRIM_DOUBLECOMPLEX 49
+#define _CFFI_PRIM_CHAR16 50
+#define _CFFI_PRIM_CHAR32 51
-#define _CFFI__NUM_PRIM 50
+#define _CFFI__NUM_PRIM 52
#define _CFFI__UNKNOWN_PRIM (-1)
#define _CFFI__UNKNOWN_FLOAT_PRIM (-2)
#define _CFFI__UNKNOWN_LONG_DOUBLE (-3)
diff --git a/pypy/module/_cffi_backend/test/_backend_test_c.py b/pypy/module/_cffi_backend/test/_backend_test_c.py
--- a/pypy/module/_cffi_backend/test/_backend_test_c.py
+++ b/pypy/module/_cffi_backend/test/_backend_test_c.py
@@ -1925,7 +1925,11 @@
assert string(a, 8).startswith(b'ABC') # may contain additional garbage
def test_string_wchar():
- BWChar = new_primitive_type("wchar_t")
+ for typename in ["wchar_t", "char16_t", "char32_t"]:
+ _test_string_wchar_variant(typename)
+
+def _test_string_wchar_variant(typename):
+ BWChar = new_primitive_type(typename)
assert string(cast(BWChar, 42)) == u+'*'
assert string(cast(BWChar, 0x4253)) == u+'\u4253'
assert string(cast(BWChar, 0)) == u+'\x00'
@@ -2088,6 +2092,10 @@
def test_wchar():
_test_wchar_variant("wchar_t")
+ if sys.platform.startswith("linux"):
+ BWChar = new_primitive_type("wchar_t")
+ assert sizeof(BWChar) == 4
+ assert int(cast(BWChar, -1)) == -1 # signed, on linux
def test_char16():
BChar16 = new_primitive_type("char16_t")
@@ -2231,6 +2239,22 @@
x = cast(BWChar, -1)
py.test.raises(ValueError, string, x)
+def test_wchar_variants_mix():
+ BWChar = new_primitive_type("wchar_t")
+ BChar16 = new_primitive_type("char16_t")
+ BChar32 = new_primitive_type("char32_t")
+ assert int(cast(BChar32, cast(BChar16, -2))) == 0xfffe
+ assert int(cast(BWChar, cast(BChar16, -2))) == 0xfffe
+ assert int(cast(BChar16, cast(BChar32, 0x0001f345))) == 0xf345
+ assert int(cast(BChar16, cast(BWChar, 0x0001f345))) == 0xf345
+ #
+ BChar16A = new_array_type(new_pointer_type(BChar16), None)
+ BChar32A = new_array_type(new_pointer_type(BChar32), None)
+ x = cast(BChar32, 'A')
+ py.test.raises(TypeError, newp, BChar16A, [x])
+ x = cast(BChar16, 'A')
+ py.test.raises(TypeError, newp, BChar32A, [x])
+
def test_keepalive_struct():
# exception to the no-keepalive rule: p=newp(BStructPtr) returns a
# pointer owning the memory, and p[0] returns a pointer to the
@@ -3457,14 +3481,15 @@
py.test.raises(TypeError, "p[1:5] = u+'XYZT'")
py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
#
- BUniChar = new_primitive_type("wchar_t")
- BArray = new_array_type(new_pointer_type(BUniChar), None)
- p = newp(BArray, u+"foobar")
- p[2:5] = [u+"*", u+"Z", u+"T"]
- p[1:3] = u+"XY"
- assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"]
- py.test.raises(TypeError, "p[1:5] = b'XYZT'")
- py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
+ for typename in ["wchar_t", "char16_t", "char32_t"]:
+ BUniChar = new_primitive_type(typename)
+ BArray = new_array_type(new_pointer_type(BUniChar), None)
+ p = newp(BArray, u+"foobar")
+ p[2:5] = [u+"*", u+"Z", u+"T"]
+ p[1:3] = u+"XY"
+ assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"]
+ py.test.raises(TypeError, "p[1:5] = b'XYZT'")
+ py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
def test_void_p_arithmetic():
BVoid = new_void_type()
@@ -3777,10 +3802,12 @@
p0 = p
assert unpack(p, 10) == b"abc\x00def\x00\x00\x00"
assert unpack(p+1, 5) == b"bc\x00de"
- BWChar = new_primitive_type("wchar_t")
- BArray = new_array_type(new_pointer_type(BWChar), 10) # wchar_t[10]
- p = newp(BArray, u"abc\x00def")
- assert unpack(p, 10) == u"abc\x00def\x00\x00\x00"
+
+ for typename in ["wchar_t", "char16_t", "char32_t"]:
+ BWChar = new_primitive_type(typename)
+ BArray = new_array_type(new_pointer_type(BWChar), 10) # wchar_t[10]
+ p = newp(BArray, u"abc\x00def")
+ assert unpack(p, 10) == u"abc\x00def\x00\x00\x00"
for typename, samples in [
("uint8_t", [0, 2**8-1]),
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,7 +1,8 @@
+from rpython.rlib.objectmodel import specialize
from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
from rpython.rtyper.lltypesystem import lltype, rffi
-SIZE_UNICHAR = rffi.sizeof(lltype.UniChar)
+SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
if SIZE_UNICODE == 4:
@@ -15,3 +16,102 @@
ordinal = intmask(ordinal - 0x10000)
return (unichr(0xD800 | (ordinal >> 10)) +
unichr(0xDC00 | (ordinal & 0x3FF)))
+
+def is_surrogate(u, index):
+ return (index + 1 < len(u) and
+ unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
+ unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
+
+def as_surrogate(u, index):
+ ordinal = (ord(u[index + 0]) - 0xD800) << 10
+ ordinal |= (ord(u[index + 1]) - 0xDC00)
+ return r_uint(ordinal + 0x10000)
+
+def unicode_to_ordinal(u):
+ if len(u) == 1:
+ u = ord(u[0])
+ return r_uint(u)
+ elif SIZE_UNICODE == 2:
+ if len(u) == 2 and is_surrogate(u, 0):
+ return r_uint(as_surrogate(u, 0))
+ raise ValueError
+
+
+class OutOfRange(Exception):
+ def __init__(self, ordinal):
+ ordinal = intmask(rffi.cast(rffi.INT, ordinal))
+ self.ordinal = ordinal
+
+
+if SIZE_UNICODE == 2:
+ def unicode_from_char32(ptr, length):
+ ptr = rffi.cast(rffi.UINTP, ptr)
+ alloc = length
+ for i in range(length):
+ if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF:
+ alloc += 1
+
+ u = [u'\x00'] * alloc
+ j = 0
+ for i in range(length):
+ ordinal = rffi.cast(lltype.Unsigned, ptr[i])
+ if ordinal > 0xFFFF:
+ if ordinal > 0x10FFFF:
+ raise OutOfRange(ordinal)
+ ordinal = intmask(ordinal - 0x10000)
+ u[j] = unichr(0xD800 | (ordinal >> 10))
+ j += 1
+ u[j] = unichr(0xDC00 | (ordinal & 0x3FF))
+ j += 1
+ else:
+ u[j] = unichr(intmask(ordinal))
+ j += 1
+ assert j == len(u)
+ return u''.join(u)
+
+ def unicode_from_char16(ptr, length):
+ return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
+else:
+ def unicode_from_char32(ptr, length):
+ return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
+ def unicode_from_char16(ptr, length):
+ ptr = rffi.cast(rffi.USHORTP, ptr)
+ u = [u'\x00'] * length
+ i = 0
+ j = 0
+ while j < length:
+ ch = intmask(ptr[j])
+ j += 1
+ if 0xD800 <= ch <= 0xDBFF and j < length:
+ ch2 = intmask(ptr[j])
+ if 0xDC00 <= ch2 <= 0xDFFF:
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+ j += 1
+ u[i] = unichr(ch)
+ i += 1
+ del u[i:]
+ return u''.join(u)
+
+
+@specialize.ll()
+def _measure_length(ptr, maxlen):
+ result = 0
+ if maxlen < 0:
+ while intmask(ptr[result]) != 0:
+ result += 1
+ else:
+ while result < maxlen and intmask(ptr[result]) != 0:
+ result += 1
+ return result
+
+def measure_length_16(ptr, maxlen=-1):
+ return _measure_length(rffi.cast(rffi.USHORTP, ptr), maxlen)
+
+def measure_length_32(ptr, maxlen=-1):
+ return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
+
+
+def unicode_to_char16(u, ptr):
+ XXX
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit