[pypy-commit] pypy cffi-char16-char32: in-progress

arigo Sat, 03 Jun 2017 22:04:40 -0700

Author: Armin Rigo <[email protected]>
Branch: cffi-char16-char32
Changeset: r91503:d6d714960021
Date: 2017-06-04 07:01 +0200
http://bitbucket.org/pypy/pypy/changeset/d6d714960021/


Log:    in-progress

diff --git a/pypy/module/_cffi_backend/cffi_opcode.py b/pypy/module/_cffi_backend/cffi_opcode.py
--- a/pypy/module/_cffi_backend/cffi_opcode.py
+++ b/pypy/module/_cffi_backend/cffi_opcode.py
@@ -107,8 +107,10 @@
 PRIM_UINTMAX       = 47
 PRIM_FLOATCOMPLEX  = 48
 PRIM_DOUBLECOMPLEX = 49
+PRIM_CHAR16        = 50
+PRIM_CHAR32        = 51
 
-_NUM_PRIM          = 50
+_NUM_PRIM          = 52
 _UNKNOWN_PRIM          = -1
 _UNKNOWN_FLOAT_PRIM    = -2
 _UNKNOWN_LONG_DOUBLE   = -3
@@ -131,8 +133,12 @@
     'float':              PRIM_FLOAT,
     'double':             PRIM_DOUBLE,
     'long double':        PRIM_LONGDOUBLE,
+    'float _Complex':     PRIM_FLOATCOMPLEX,
+    'double _Complex':    PRIM_DOUBLECOMPLEX,
     '_Bool':              PRIM_BOOL,
     'wchar_t':            PRIM_WCHAR,
+    'char16_t':           PRIM_CHAR16,
+    'char32_t':           PRIM_CHAR32,
     'int8_t':             PRIM_INT8,
     'uint8_t':            PRIM_UINT8,
     'int16_t':            PRIM_INT16,
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -42,6 +42,7 @@
     def cast_unicode(self, w_ob):
         space = self.space
         s = space.unicode_w(w_ob)
+        XXXXXXXXXXXXXX
         if len(s) != 1:
             raise oefmt(space.w_TypeError,
                         "cannot cast unicode string of length %d to ctype '%s'",
@@ -149,15 +150,15 @@
 
 
 class W_CTypePrimitiveUniChar(W_CTypePrimitiveCharOrUniChar):
-    _attrs_            = ['is_signed']
-    _immutable_fields_ = ['is_signed']
+    _attrs_            = ['is_signed_wchar']
+    _immutable_fields_ = ['is_signed_wchar']
 
     _wchar_is_signed = rfficache.signof_c_type('wchar_t')
 
     def __init__(self, space, size, name, name_position, align):
-        W_CTypePrimitiveUniChar.__init__(self, space, size, name,
-                                         name_position, align)
-        self.is_signed = self._wchar_is_signed and (name == "wchar_t")
+        W_CTypePrimitiveCharOrUniChar.__init__(self, space, size, name,
+                                               name_position, align)
+        self.is_signed_wchar = self._wchar_is_signed and (name == "wchar_t")
         # "char16_t" and "char32_t" are always unsigned
 
     def cast_to_int(self, cdata):
@@ -185,32 +186,41 @@
             w_res = self.convert_to_object(ptr)
         return w_res
 
-    def _convert_to_charN_t(self, w_ob, size):
-        # returns a r_uint.  If size == 2, it is smaller than 0x10000
+    def _convert_to_charN_t(self, w_ob):
+        # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
         space = self.space
         if space.isinstance_w(w_ob, space.w_unicode):
             u = space.unicode_w(w_ob)
-            if len(u) == 1:
-                u = ord(u[0])
-                if size == 2 and u > 0xffff:
+            try:
+                ordinal = wchar_helper.unicode_to_ordinal(u)
+            except ValueError:
+                pass
+            else:
+                if self.size == 2 and ordinal > 0xffff:
                     raise self._convert_error("single character <= 0xFFFF",
                                               w_ob)
-                return r_uint(u)
-            elif size == 4 and len(u) == 2 and ...
-
+                return ordinal
         elif (isinstance(w_ob, cdataobj.W_CData) and
                isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
-               w_ob.ctype.size == 2):
+               w_ob.ctype.size == self.size):
             with w_ob as ptr:
-                return misc.read_raw_ulong_data(ptr, 2)
+                return misc.read_raw_ulong_data(ptr, self.size)
         raise self._convert_error("unicode string of length 1", w_ob)
 
     def convert_from_object(self, cdata, w_ob):
-        ordinal = self._convert_to_char16(w_ob, self.size)
+        ordinal = self._convert_to_charN_t(w_ob)
         misc.write_raw_unsigned_data(cdata, ordinal, self.size)
 
     def unpack_ptr(self, w_ctypeptr, ptr, length):
-        u = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+        if self.size == 2:
+            u = wchar_helper.unicode_from_char16(ptr, length)
+        else:
+            try:
+                u = wchar_helper.unicode_from_char32(ptr, length)
+            except OutOfRange as e:
+                raise oefmt(self.space.w_ValueError,
+                            "char32_t out of range for "
+                            "conversion to unicode: %s", hex(e.ordinal))
         return self.space.newunicode(u)
 
 
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -91,11 +91,24 @@
             if not space.isinstance_w(w_ob, space.w_unicode):
                 raise self._convert_error("unicode or list or tuple", w_ob)
             s = space.unicode_w(w_ob)
+            XXXXXXXXXXXXXXX
             n = len(s)
             if self.length >= 0 and n > self.length:
                 raise oefmt(space.w_IndexError,
                             "initializer unicode string is too long for '%s' "
                             "(got %d characters)", self.name, n)
+
+
+
+
+            if self.ctitem.size == 2:
+                length = wchar_helper.measure_length_16(ptr, length)
+            else:
+                length = wchar_helper.measure_length_32(ptr, length)
+            XXXX
+
+
+
             unichardata = rffi.cast(rffi.CWCHARP, cdata)
             copy_unicode_to_raw(llunicode(s), unichardata, 0, n)
             if n != self.length:
@@ -134,12 +147,12 @@
                 #
                 # pointer to a wchar_t: builds and returns a unicode
                 if self.is_unichar_ptr_or_array():
-                    cdata = rffi.cast(rffi.CWCHARP, ptr)
-                    if length < 0:
-                        u = rffi.wcharp2unicode(cdata)
+                    from pypy.module._cffi_backend import wchar_helper
+                    if self.ctitem.size == 2:
+                        length = wchar_helper.measure_length_16(ptr, length)
                     else:
-                        u = rffi.wcharp2unicoden(cdata, length)
-                    return space.newunicode(u)
+                        length = wchar_helper.measure_length_32(ptr, length)
+                    return self.ctitem.unpack_ptr(self, ptr, length)
         #
         return W_CType.string(self, cdataobj, maxlen)
 
@@ -304,6 +317,7 @@
             length = space.int_w(space.len(w_init))
         elif space.isinstance_w(w_init, space.w_basestring):
             # from a string, we add the null terminator
+            XXXXXXXXXXXXXXX
             length = space.int_w(space.len(w_init)) + 1
         elif self.is_file:
             result = self.prepare_file(w_init)
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -296,6 +296,7 @@
         return (w_value, space.int_w(space.len(w_value)))
     elif space.isinstance_w(w_value, space.w_basestring):
         # from a string, we add the null terminator
+        XXXXXXXXXX
         return (w_value, space.int_w(space.len(w_value)) + 1)
     else:
         explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/realize_c_type.py b/pypy/module/_cffi_backend/realize_c_type.py
--- a/pypy/module/_cffi_backend/realize_c_type.py
+++ b/pypy/module/_cffi_backend/realize_c_type.py
@@ -73,6 +73,8 @@
         "uintmax_t",
         "float _Complex",
         "double _Complex",
+        "char16_t",
+        "char32_t",
         ]
     assert len(NAMES) == cffi_opcode._NUM_PRIM
 
diff --git a/pypy/module/_cffi_backend/src/parse_c_type.c b/pypy/module/_cffi_backend/src/parse_c_type.c
--- a/pypy/module/_cffi_backend/src/parse_c_type.c
+++ b/pypy/module/_cffi_backend/src/parse_c_type.c
@@ -505,6 +505,7 @@
 
     case '1':
         if (size == 8 && !memcmp(p, "uint16", 6)) return _CFFI_PRIM_UINT16;
+        if (size == 8 && !memcmp(p, "char16", 6)) return _CFFI_PRIM_CHAR16;
         break;
 
     case '2':
@@ -513,6 +514,7 @@
 
     case '3':
         if (size == 8 && !memcmp(p, "uint32", 6)) return _CFFI_PRIM_UINT32;
+        if (size == 8 && !memcmp(p, "char32", 6)) return _CFFI_PRIM_CHAR32;
         break;
 
     case '4':
diff --git a/pypy/module/_cffi_backend/src/parse_c_type.h b/pypy/module/_cffi_backend/src/parse_c_type.h
--- a/pypy/module/_cffi_backend/src/parse_c_type.h
+++ b/pypy/module/_cffi_backend/src/parse_c_type.h
@@ -80,8 +80,10 @@
 #define _CFFI_PRIM_UINTMAX      47
 #define _CFFI_PRIM_FLOATCOMPLEX 48
 #define _CFFI_PRIM_DOUBLECOMPLEX 49
+#define _CFFI_PRIM_CHAR16       50
+#define _CFFI_PRIM_CHAR32       51
 
-#define _CFFI__NUM_PRIM         50
+#define _CFFI__NUM_PRIM         52
 #define _CFFI__UNKNOWN_PRIM           (-1)
 #define _CFFI__UNKNOWN_FLOAT_PRIM     (-2)
 #define _CFFI__UNKNOWN_LONG_DOUBLE    (-3)
diff --git a/pypy/module/_cffi_backend/test/_backend_test_c.py b/pypy/module/_cffi_backend/test/_backend_test_c.py
--- a/pypy/module/_cffi_backend/test/_backend_test_c.py
+++ b/pypy/module/_cffi_backend/test/_backend_test_c.py
@@ -1925,7 +1925,11 @@
         assert string(a, 8).startswith(b'ABC')  # may contain additional garbage
 
 def test_string_wchar():
-    BWChar = new_primitive_type("wchar_t")
+    for typename in ["wchar_t", "char16_t", "char32_t"]:
+        _test_string_wchar_variant(typename)
+
+def _test_string_wchar_variant(typename):
+    BWChar = new_primitive_type(typename)
     assert string(cast(BWChar, 42)) == u+'*'
     assert string(cast(BWChar, 0x4253)) == u+'\u4253'
     assert string(cast(BWChar, 0)) == u+'\x00'
@@ -2088,6 +2092,10 @@
 
 def test_wchar():
     _test_wchar_variant("wchar_t")
+    if sys.platform.startswith("linux"):
+        BWChar = new_primitive_type("wchar_t")
+        assert sizeof(BWChar) == 4
+        assert int(cast(BWChar, -1)) == -1        # signed, on linux
 
 def test_char16():
     BChar16 = new_primitive_type("char16_t")
@@ -2231,6 +2239,22 @@
         x = cast(BWChar, -1)
         py.test.raises(ValueError, string, x)
 
+def test_wchar_variants_mix():
+    BWChar  = new_primitive_type("wchar_t")
+    BChar16 = new_primitive_type("char16_t")
+    BChar32 = new_primitive_type("char32_t")
+    assert int(cast(BChar32, cast(BChar16, -2))) == 0xfffe
+    assert int(cast(BWChar, cast(BChar16, -2))) == 0xfffe
+    assert int(cast(BChar16, cast(BChar32, 0x0001f345))) == 0xf345
+    assert int(cast(BChar16, cast(BWChar, 0x0001f345))) == 0xf345
+    #
+    BChar16A = new_array_type(new_pointer_type(BChar16), None)
+    BChar32A = new_array_type(new_pointer_type(BChar32), None)
+    x = cast(BChar32, 'A')
+    py.test.raises(TypeError, newp, BChar16A, [x])
+    x = cast(BChar16, 'A')
+    py.test.raises(TypeError, newp, BChar32A, [x])
+
 def test_keepalive_struct():
     # exception to the no-keepalive rule: p=newp(BStructPtr) returns a
     # pointer owning the memory, and p[0] returns a pointer to the
@@ -3457,14 +3481,15 @@
     py.test.raises(TypeError, "p[1:5] = u+'XYZT'")
     py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
     #
-    BUniChar = new_primitive_type("wchar_t")
-    BArray = new_array_type(new_pointer_type(BUniChar), None)
-    p = newp(BArray, u+"foobar")
-    p[2:5] = [u+"*", u+"Z", u+"T"]
-    p[1:3] = u+"XY"
-    assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"]
-    py.test.raises(TypeError, "p[1:5] = b'XYZT'")
-    py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
+    for typename in ["wchar_t", "char16_t", "char32_t"]:
+        BUniChar = new_primitive_type(typename)
+        BArray = new_array_type(new_pointer_type(BUniChar), None)
+        p = newp(BArray, u+"foobar")
+        p[2:5] = [u+"*", u+"Z", u+"T"]
+        p[1:3] = u+"XY"
+        assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"]
+        py.test.raises(TypeError, "p[1:5] = b'XYZT'")
+        py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]")
 
 def test_void_p_arithmetic():
     BVoid = new_void_type()
@@ -3777,10 +3802,12 @@
     p0 = p
     assert unpack(p, 10) == b"abc\x00def\x00\x00\x00"
     assert unpack(p+1, 5) == b"bc\x00de"
-    BWChar = new_primitive_type("wchar_t")
-    BArray = new_array_type(new_pointer_type(BWChar), 10)   # wchar_t[10]
-    p = newp(BArray, u"abc\x00def")
-    assert unpack(p, 10) == u"abc\x00def\x00\x00\x00"
+
+    for typename in ["wchar_t", "char16_t", "char32_t"]:
+        BWChar = new_primitive_type(typename)
+        BArray = new_array_type(new_pointer_type(BWChar), 10)   # wchar_t[10]
+        p = newp(BArray, u"abc\x00def")
+        assert unpack(p, 10) == u"abc\x00def\x00\x00\x00"
 
     for typename, samples in [
             ("uint8_t",  [0, 2**8-1]),
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,7 +1,8 @@
+from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
 from rpython.rtyper.lltypesystem import lltype, rffi
 
-SIZE_UNICHAR = rffi.sizeof(lltype.UniChar)
+SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
 
 
 if SIZE_UNICODE == 4:
@@ -15,3 +16,102 @@
             ordinal = intmask(ordinal - 0x10000)
             return (unichr(0xD800 | (ordinal >> 10)) +
                     unichr(0xDC00 | (ordinal & 0x3FF)))
+
+def is_surrogate(u, index):
+    return (index + 1 < len(u) and
+            unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
+            unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
+
+def as_surrogate(u, index):
+    ordinal = (ord(u[index + 0]) - 0xD800) << 10
+    ordinal |= (ord(u[index + 1]) - 0xDC00)
+    return r_uint(ordinal + 0x10000)
+
+def unicode_to_ordinal(u):
+    if len(u) == 1:
+        u = ord(u[0])
+        return r_uint(u)
+    elif SIZE_UNICODE == 2:
+        if len(u) == 2 and is_surrogate(u, 0):
+            return r_uint(as_surrogate(u, 0))
+    raise ValueError
+
+
+class OutOfRange(Exception):
+    def __init__(self, ordinal):
+        ordinal = intmask(rffi.cast(rffi.INT, ordinal))
+        self.ordinal = ordinal
+
+
+if SIZE_UNICODE == 2:
+    def unicode_from_char32(ptr, length):
+        ptr = rffi.cast(rffi.UINTP, ptr)
+        alloc = length
+        for i in range(length):
+            if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF:
+                alloc += 1
+
+        u = [u'\x00'] * alloc
+        j = 0
+        for i in range(length):
+            ordinal = rffi.cast(lltype.Unsigned, ptr[i])
+            if ordinal > 0xFFFF:
+                if ordinal > 0x10FFFF:
+                    raise OutOfRange(ordinal)
+                ordinal = intmask(ordinal - 0x10000)
+                u[j] = unichr(0xD800 | (ordinal >> 10))
+                j += 1
+                u[j] = unichr(0xDC00 | (ordinal & 0x3FF))
+                j += 1
+            else:
+                u[j] = unichr(intmask(ordinal))
+                j += 1
+        assert j == len(u)
+        return u''.join(u)
+
+    def unicode_from_char16(ptr, length):
+        return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
+else:
+    def unicode_from_char32(ptr, length):
+        return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
+    def unicode_from_char16(ptr, length):
+        ptr = rffi.cast(rffi.USHORTP, ptr)
+        u = [u'\x00'] * length
+        i = 0
+        j = 0
+        while j < length:
+            ch = intmask(ptr[j])
+            j += 1
+            if 0xD800 <= ch <= 0xDBFF and j < length:
+                ch2 = intmask(ptr[j])
+                if 0xDC00 <= ch2 <= 0xDFFF:
+                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+                    j += 1
+            u[i] = unichr(ch)
+            i += 1
+        del u[i:]
+        return u''.join(u)
+
+
+@specialize.ll()
+def _measure_length(ptr, maxlen):
+    result = 0
+    if maxlen < 0:
+        while intmask(ptr[result]) != 0:
+            result += 1
+    else:
+        while result < maxlen and intmask(ptr[result]) != 0:
+            result += 1
+    return result
+
+def measure_length_16(ptr, maxlen=-1):
+    return _measure_length(rffi.cast(rffi.USHORTP, ptr), maxlen)
+
+def measure_length_32(ptr, maxlen=-1):
+    return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
+
+
+def unicode_to_char16(u, ptr):
+    XXX
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy cffi-char16-char32: in-progress

Reply via email to