Re: [PR] feat(python): Add array creation/building from buffers [arrow-nanoarrow]

via GitHub Thu, 15 Feb 2024 02:47:34 -0800


jorisvandenbossche commented on code in PR #378:
URL: https://github.com/apache/arrow-nanoarrow/pull/378#discussion_r1490785343



##########
python/src/nanoarrow/c_lib.py:
##########
@@ -120,15 +157,134 @@ def c_array(obj=None, requested_schema=None) -> CArray:
             *obj.__arrow_c_array__(requested_schema=requested_schema_capsule)
         )
 
-    # for pyarrow < 14.0
-    if hasattr(obj, "_export_to_c"):
+    # Try buffer protocol (e.g., numpy arrays or a c_buffer())
+    if _obj_is_buffer(obj):
+        return _c_array_from_pybuffer(obj)
+
+    # Try import of bare capsule
+    if _obj_is_capsule(obj, "arrow_array"):
+        if requested_schema is None:
+            requested_schema_capsule = CSchema.allocate()._capsule
+        else:
+            requested_schema_capsule = requested_schema.__arrow_c_schema__()
+
+        return CArray._import_from_c_capsule(requested_schema_capsule, obj)
+
+    # Try _export_to_c for Array/RecordBatch objects if pyarrow < 14.0
+    if _obj_is_pyarrow_array(obj):
         out = CArray.allocate(CSchema.allocate())
         obj._export_to_c(out._addr(), out.schema._addr())
         return out
-    else:
-        raise TypeError(
-            f"Can't convert object of type {type(obj).__name__} to 
nanoarrow.c_array"
-        )
+
+    # Try import of iterable
+    if _obj_is_iterable(obj):
+        return _c_array_from_iterable(obj, requested_schema)
+
+    raise TypeError(
+        f"Can't convert object of type {type(obj).__name__} to 
nanoarrow.c_array"
+    )
+
+
+def c_array_from_buffers(
+    schema,
+    length: int,
+    buffers: Iterable[Any],
+    null_count: int = -1,
+    offset: int = 0,
+    children: Iterable[Any] = (),
+    validation_level: Literal["full", "default", "minimal", "none"] = 
"default",
+) -> CArray:
+    """Create an ArrowArray wrapper from components
+
+    Given a schema, build an ArrowArray buffer-wise. This allows almost any 
array
+    to be assembled; however, requires some knowledge of the Arrow Columnar
+    specification. This function will do its best to validate the sizes and
+    content of buffers according to ``validation_level``, which can be set
+    to ``"full""`` for maximum safety.
+
+    Parameters
+    ----------
+

Review Comment:
   ```suggestion
   ```



##########
python/src/nanoarrow/c_lib.py:
##########
@@ -75,35 +93,53 @@ def c_schema(obj=None) -> CSchema:
         )
 
 
-def c_array(obj=None, requested_schema=None) -> CArray:
+def c_array(obj, requested_schema=None) -> CArray:
     """ArrowArray wrapper
 
     This class provides a user-facing interface to access the fields of an 
ArrowArray
     as defined in the Arrow C Data interface, holding an optional reference to 
a
     :class:`CSchema` that can be used to safely deserialize the content.
 
     These objects are created using :func:`c_array`, which accepts any 
array-like
-    object according to the Arrow PyCapsule interface.
+    object according to the Arrow PyCapsule interface, Python buffer protocol,
+    or iterable of Python objects.
 
     This Python wrapper allows access to array fields but does not 
automatically
     deserialize their content: use :func:`c_array_view` to validate and 
deserialize
     the content into a more easily inspectable object.
 
     Note that the :class:`CArray` objects returned by ``.child()`` hold strong
-    references to the original ``ArrowSchema`` to avoid copies while 
inspecting an
+    references to the original ``ArrowArray`` to avoid copies while inspecting 
an
     imported structure.
 
+    Parameters
+    ----------
+    obj : array-like
+        An object supporting the Arrow PyCapsule interface, the Python

Review Comment:
   ```suggestion
           An object supporting the Arrow PyCapsule interface, the Python buffer
   ```



##########
python/src/nanoarrow/c_lib.py:
##########
@@ -120,15 +157,134 @@ def c_array(obj=None, requested_schema=None) -> CArray:
             *obj.__arrow_c_array__(requested_schema=requested_schema_capsule)
         )
 
-    # for pyarrow < 14.0
-    if hasattr(obj, "_export_to_c"):
+    # Try buffer protocol (e.g., numpy arrays or a c_buffer())
+    if _obj_is_buffer(obj):
+        return _c_array_from_pybuffer(obj)
+
+    # Try import of bare capsule
+    if _obj_is_capsule(obj, "arrow_array"):
+        if requested_schema is None:
+            requested_schema_capsule = CSchema.allocate()._capsule

Review Comment:
   How does the import work if you don't have an actual schema? (I suppose the 
above allocates an "empty" ArrowSchema?)



##########
python/src/nanoarrow/c_lib.py:
##########
@@ -257,7 +413,60 @@ def c_array_view(obj, requested_schema=None) -> CArrayView:
     return CArrayView.from_cpu_array(c_array(obj, requested_schema))
 
 
-def allocate_c_schema():
+def c_buffer(obj, requested_schema=None) -> CBuffer:
+    """Owning, read-only ArrowBuffer wrapper
+
+    If obj implement the Python buffer protocol, ``c_buffer()`` Wraps

Review Comment:
   ```suggestion
       If obj implement the Python buffer protocol, ``c_buffer()`` wraps
   ```



##########
python/src/nanoarrow/c_lib.py:
##########
@@ -120,15 +157,134 @@ def c_array(obj=None, requested_schema=None) -> CArray:
             *obj.__arrow_c_array__(requested_schema=requested_schema_capsule)
         )
 
-    # for pyarrow < 14.0
-    if hasattr(obj, "_export_to_c"):
+    # Try buffer protocol (e.g., numpy arrays or a c_buffer())
+    if _obj_is_buffer(obj):
+        return _c_array_from_pybuffer(obj)
+
+    # Try import of bare capsule
+    if _obj_is_capsule(obj, "arrow_array"):
+        if requested_schema is None:
+            requested_schema_capsule = CSchema.allocate()._capsule
+        else:
+            requested_schema_capsule = requested_schema.__arrow_c_schema__()
+
+        return CArray._import_from_c_capsule(requested_schema_capsule, obj)
+
+    # Try _export_to_c for Array/RecordBatch objects if pyarrow < 14.0
+    if _obj_is_pyarrow_array(obj):
         out = CArray.allocate(CSchema.allocate())
         obj._export_to_c(out._addr(), out.schema._addr())
         return out
-    else:
-        raise TypeError(
-            f"Can't convert object of type {type(obj).__name__} to 
nanoarrow.c_array"
-        )
+
+    # Try import of iterable
+    if _obj_is_iterable(obj):
+        return _c_array_from_iterable(obj, requested_schema)
+
+    raise TypeError(
+        f"Can't convert object of type {type(obj).__name__} to 
nanoarrow.c_array"
+    )
+
+
+def c_array_from_buffers(
+    schema,
+    length: int,
+    buffers: Iterable[Any],
+    null_count: int = -1,
+    offset: int = 0,
+    children: Iterable[Any] = (),
+    validation_level: Literal["full", "default", "minimal", "none"] = 
"default",
+) -> CArray:
+    """Create an ArrowArray wrapper from components
+
+    Given a schema, build an ArrowArray buffer-wise. This allows almost any 
array
+    to be assembled; however, requires some knowledge of the Arrow Columnar
+    specification. This function will do its best to validate the sizes and
+    content of buffers according to ``validation_level``, which can be set
+    to ``"full""`` for maximum safety.
+
+    Parameters
+    ----------
+
+    schema : schema-like
+        The data type of the desired array as sanitized by :func:`c_schema`.
+    length : int
+        The length of the output array.
+    buffers : Iterable of buffer-like or None
+        An iterable of buffers as sanitized by :func:`c_buffer`. Any object
+        supporting the Python Buffer protocol is accepted. Buffer data types
+        are not checked. A buffer value of ``None`` will skip setting a buffer
+        (i.e., that buffer will be of length zero and its pointer will
+        be ``NULL``).
+    null_count : int, optional
+        The number of null values, if known in advance. If -1 (the default),
+        the null count will be calculated based on the validity bitmap. If
+        the validity bitmap was set to ``None``, the calculated null count
+        will be zero.
+    offset : int, optional
+        The logical offset from the start of the array.
+    children : Iterable of array-like
+        An iterable of arrays used to set child fields of the array. Can 
contain
+        any object accepted by :func:`c_array`. Must contain the exact number 
of
+        required children as specifed by ``schema``.
+    validation_level: str, optional
+        One of "none" (no check), "minimal" (check buffer sizes that do not 
require
+        dereferencing buffer content), "default" (check all buffer sizes), or 
"full"
+        (check all buffer sizes and all buffer content).
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> c_array = na.c_array_from_buffers(na.uint8(), 5, [None, b"12345"])
+    >>> na.c_array_view(c_array)
+    <nanoarrow.c_lib.CArrayView>
+    - storage_type: 'uint8'
+    - length: 5
+    - offset: 0
+    - null_count: 0
+    - buffers[2]:
+      - validity <bool[0 b] >
+      - data <uint8[5 b] 49 50 51 52 53>
+    - dictionary: NULL
+    - children[0]:
+    """
+    schema = c_schema(schema)
+    builder = CArrayBuilder.allocate()
+
+    # This is slightly wasteful: it will allocate arrays recursively and we 
are about

Review Comment:
   Is this about the _child_ arrays? (The parent array itself isn't released; 
for that one we just set the buffers, right?)



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -176,6 +178,188 @@ cdef object alloc_c_array_shallow_copy(object base, const 
ArrowArray* c_array) n
     return array_capsule
 
 
+cdef void pycapsule_buffer_deleter(object stream_capsule) noexcept:
+    cdef ArrowBuffer* buffer = <ArrowBuffer*>PyCapsule_GetPointer(
+        stream_capsule, 'nanoarrow_buffer'
+    )
+
+    ArrowBufferReset(buffer)
+    ArrowFree(buffer)
+
+
+cdef object alloc_c_buffer(ArrowBuffer** c_buffer) noexcept:
+    c_buffer[0] = <ArrowBuffer*> ArrowMalloc(sizeof(ArrowBuffer))
+    ArrowBufferInit(c_buffer[0])
+    return PyCapsule_New(c_buffer[0], 'nanoarrow_buffer', 
&pycapsule_buffer_deleter)
+
+cdef void c_deallocate_pybuffer(ArrowBufferAllocator* allocator, uint8_t* ptr, 
int64_t size) noexcept with gil:
+    cdef Py_buffer* buffer = <Py_buffer*>allocator.private_data
+    PyBuffer_Release(buffer)
+    ArrowFree(buffer)
+
+
+cdef ArrowBufferAllocator c_pybuffer_deallocator(Py_buffer* buffer):
+    # This should probably be changed in nanoarrow C; however, currently, the 
deallocator
+    # won't get called if buffer.buf is NULL.
+    if buffer.buf == NULL:
+        PyBuffer_Release(buffer)
+        return ArrowBufferAllocatorDefault()
+
+    cdef Py_buffer* allocator_private = 
<Py_buffer*>ArrowMalloc(sizeof(Py_buffer))
+    if allocator_private == NULL:
+        PyBuffer_Release(buffer)
+        raise MemoryError()
+
+    memcpy(allocator_private, buffer, sizeof(Py_buffer))
+    return 
ArrowBufferDeallocator(<ArrowBufferDeallocatorCallback>&c_deallocate_pybuffer, 
allocator_private)
+
+
+cdef c_arrow_type_from_format(format):
+    # PyBuffer_SizeFromFormat() was added in Python 3.9 (potentially faster)
+    item_size = calcsize(format)
+
+    # Don't allow non-native endian values
+    if sys_byteorder == "little" and (">" in format or "!" in format):
+        raise ValueError(f"Can't convert format '{format}' to Arrow type")
+    elif sys_byteorder == "big" and  "<" in format:
+        raise ValueError(f"Can't convert format '{format}' to Arrow type")
+
+    # Strip system endian specifiers
+    format = format.strip("=@")
+
+    if format == "c":
+        return 0, NANOARROW_TYPE_STRING
+    elif format == "e":
+        return item_size, NANOARROW_TYPE_HALF_FLOAT
+    elif format == "f":
+        return item_size, NANOARROW_TYPE_FLOAT
+    elif format == "d":
+        return item_size, NANOARROW_TYPE_DOUBLE
+
+    # Check for signed integers
+    if format in ("b", "?", "h", "i", "l", "q", "n"):
+        if item_size == 1:
+            return item_size, NANOARROW_TYPE_INT8
+        elif item_size == 2:
+            return item_size, NANOARROW_TYPE_INT16
+        elif item_size == 4:
+            return item_size, NANOARROW_TYPE_INT32
+        elif item_size == 8:
+            return item_size, NANOARROW_TYPE_INT64
+
+    # Check for unsinged integers
+    if format in ("B", "H", "I", "L", "Q", "N"):
+        if item_size == 1:
+            return item_size, NANOARROW_TYPE_UINT8
+        elif item_size == 2:
+            return item_size, NANOARROW_TYPE_UINT16
+        elif item_size == 4:
+            return item_size, NANOARROW_TYPE_UINT32
+        elif item_size == 8:
+            return item_size, NANOARROW_TYPE_UINT64
+
+    # If all else fails, return opaque fixed-size binary
+    return item_size, NANOARROW_TYPE_BINARY
+
+
+cdef int c_format_from_arrow_type(ArrowType type_id, int element_size_bits, 
size_t out_size, char* out):
+    if type_id in (NANOARROW_TYPE_BINARY, NANOARROW_TYPE_FIXED_SIZE_BINARY) 
and element_size_bits > 0:
+        snprintf(out, out_size, "%ds", <int>(element_size_bits // 8))
+        return element_size_bits
+
+    cdef const char* format_const = ""
+    cdef int element_size_bits_calc = 0
+    if type_id == NANOARROW_TYPE_STRING:
+        format_const = "c"
+        element_size_bits_calc = 0
+    elif type_id == NANOARROW_TYPE_BINARY:
+        format_const = "B"
+        element_size_bits_calc = 0
+    elif type_id == NANOARROW_TYPE_BOOL:
+        # Bitmaps export as unspecified binary
+        format_const = "B"
+        element_size_bits_calc = 1
+    elif type_id == NANOARROW_TYPE_INT8:
+        format_const = "b"
+        element_size_bits_calc = 8
+    elif type_id == NANOARROW_TYPE_UINT8:
+        format_const = "B"
+        element_size_bits_calc = 8
+    elif type_id == NANOARROW_TYPE_INT16:
+        format_const = "h"
+        element_size_bits_calc = 16
+    elif type_id == NANOARROW_TYPE_UINT16:
+        format_const = "H"
+        element_size_bits_calc = 16
+    elif type_id in (NANOARROW_TYPE_INT32, NANOARROW_TYPE_INTERVAL_MONTHS):
+        format_const = "i"
+        element_size_bits_calc = 32
+    elif type_id == NANOARROW_TYPE_UINT32:
+        format_const = "I"
+        element_size_bits_calc = 32
+    elif type_id == NANOARROW_TYPE_INT64:
+        format_const = "q"
+        element_size_bits_calc = 64
+    elif type_id == NANOARROW_TYPE_UINT64:
+        format_const = "Q"
+        element_size_bits_calc = 64
+    elif type_id == NANOARROW_TYPE_HALF_FLOAT:
+        format_const = "e"
+        element_size_bits_calc = 16
+    elif type_id == NANOARROW_TYPE_FLOAT:
+        format_const = "f"
+        element_size_bits_calc = 32
+    elif type_id == NANOARROW_TYPE_DOUBLE:
+        format_const = "d"
+        element_size_bits_calc = 64
+    elif type_id == NANOARROW_TYPE_INTERVAL_DAY_TIME:
+        format_const = "ii"
+        element_size_bits_calc = 64
+    elif type_id == NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
+        format_const = "iiq"
+        element_size_bits_calc = 128
+    elif type_id == NANOARROW_TYPE_DECIMAL128:
+        format_const = "16s"
+        element_size_bits_calc = 128
+    elif type_id == NANOARROW_TYPE_DECIMAL256:
+        format_const = "32s"
+        element_size_bits_calc = 256
+    else:
+        raise ValueError(f"Unsupported Arrow type_id for format conversion: 
{type_id}")
+
+    snprintf(out, out_size, "%s", format_const)
+    return element_size_bits_calc
+
+
+cdef object c_buffer_set_pybuffer(object obj, ArrowBuffer** c_buffer):
+    ArrowBufferReset(c_buffer[0])
+
+    cdef Py_buffer buffer
+    cdef int rc = PyObject_GetBuffer(obj, &buffer, PyBUF_FORMAT | 
PyBUF_ANY_CONTIGUOUS)
+    if rc != 0:
+        raise BufferError()
+
+    # Parse the buffer's format string to get the ArrowType and element size
+    try:
+        if buffer.format == NULL:
+            format = "B"
+        else:
+            format = buffer.format.decode("UTF-8")
+    except Exception as e:
+        PyBuffer_Release(&buffer)
+        raise e
+
+    # Transfers ownership of buffer to c_buffer, whose finalizer will be 
called by
+    # the capsule when the capsule is deleted or garbage collected
+    c_buffer[0].data = <uint8_t*>buffer.buf
+    c_buffer[0].size_bytes = <int64_t>buffer.len
+    c_buffer[0].capacity_bytes = 0
+    c_buffer[0].allocator = c_pybuffer_deallocator(&buffer)

Review Comment:
   Ah, I see: you need to call `PyBuffer_Release`, and I assume Python 
increments the reference count on the object and keeps it alive until that 
release is called (so essentially the same as with the C Data Interface release 
callback).



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -1177,11 +1429,395 @@ cdef class CBufferView:
         buffer.strides = &self._strides
         buffer.suboffsets = NULL
 
-    def __releasebuffer__(self, Py_buffer *buffer):
+    cdef _do_releasebuffer(self, Py_buffer* buffer):
         pass
 
     def __repr__(self):
-        return f"<nanoarrow.c_lib.CBufferView>\n  
{_lib_utils.buffer_view_repr(self)[1:]}"
+        return f"CBufferView({_lib_utils.buffer_view_repr(self)})"
+
+
+cdef class CBuffer:
+    """Wrapper around readable owned buffer content
+
+    Like the CBufferView, the CBuffer represents readable buffer content; 
however,
+    unlike the CBufferView, the CBuffer always represents a valid ArrowBuffer 
C object.
+    """
+    cdef object _base
+    cdef ArrowBuffer* _ptr
+    cdef ArrowType _data_type
+    cdef int _element_size_bits
+    cdef char _format[32]
+    cdef CDevice _device
+    cdef CBufferView _view
+    cdef int _get_buffer_count
+
+    def __cinit__(self):
+        self._base = None
+        self._ptr = NULL
+        self._data_type = NANOARROW_TYPE_BINARY
+        self._element_size_bits = 0
+        self._device = CDEVICE_CPU
+        self._format[0] = 0
+        self._get_buffer_count = 0
+        self._reset_view()
+
+    cdef _assert_valid(self):
+        if self._ptr == NULL:
+            raise RuntimeError("CBuffer is not valid")
+
+    cdef _assert_buffer_count_zero(self):
+        if self._get_buffer_count != 0:
+            raise RuntimeError(
+                f"CBuffer already open ({self._get_buffer_count} ",
+                f"references, {self._writable_get_buffer_count} writable)")
+
+    cdef _reset_view(self):
+        self._view = CBufferView(None, 0, 0, NANOARROW_TYPE_BINARY, 8, 
self._device)
+
+    cdef _populate_view(self):
+        self._assert_valid()
+        self._assert_buffer_count_zero()
+        self._view = CBufferView(
+            self._base, <uintptr_t>self._ptr.data,
+            self._ptr.size_bytes, self._data_type, self._element_size_bits,
+            self._device
+        )
+
+    cdef _refresh_view_if_needed(self):
+        if self._get_buffer_count > 0:
+            return
+
+        self._assert_valid()
+        cdef int addr_equal = self._ptr.data == self._view._ptr.data.as_uint8
+        cdef int size_equal = self._ptr.size_bytes == 
self._view._ptr.size_bytes
+        cdef int types_equal = self._data_type == self._view._data_type
+        cdef int element_size_equal = self._element_size_bits == 
self._view.element_size_bits
+        if addr_equal and size_equal and types_equal and element_size_equal:
+            return
+
+        self._populate_view()
+
+    def set_empty(self):
+        self._assert_buffer_count_zero()
+        if self._ptr == NULL:
+            self._base = alloc_c_buffer(&self._ptr)
+        ArrowBufferReset(self._ptr)
+
+        self._data_type = NANOARROW_TYPE_BINARY
+        self._element_size_bits = 0
+        self._device = CDEVICE_CPU
+        self._reset_view()
+        return self
+
+    def set_pybuffer(self, obj):
+        self._assert_buffer_count_zero()
+        if self._ptr == NULL:
+            self._base = alloc_c_buffer(&self._ptr)
+
+        self.set_format(c_buffer_set_pybuffer(obj, &self._ptr))
+        self._device = CDEVICE_CPU
+        self._reset_view()
+        return self
+
+    def set_format(self, str format):
+        self._assert_buffer_count_zero()
+        element_size_bytes, data_type = c_arrow_type_from_format(format)
+        self._data_type = data_type
+        self._element_size_bits = element_size_bytes * 8
+        format_bytes = format.encode("UTF-8")
+        snprintf(self._format, sizeof(self._format), "%s", <const 
char*>format_bytes)
+        return self
+
+    def set_data_type(self, ArrowType type_id, int element_size_bits=0):
+        self._assert_buffer_count_zero()
+        self._element_size_bits = c_format_from_arrow_type(
+            type_id,
+            element_size_bits,
+            sizeof(self._format),
+            self._format
+        )
+        self._data_type = type_id
+
+        return self
+
+    def _addr(self):
+        self._assert_valid()
+        return <uintptr_t>self._ptr.data
+
+    @property
+    def size_bytes(self):
+        self._assert_valid()
+        return self._ptr.size_bytes
+
+    @property
+    def capacity_bytes(self):
+        self._assert_valid()
+        return self._ptr.capacity_bytes
+
+    @property
+    def data_type(self):
+        return ArrowTypeString(self._data_type).decode("UTF-8")
+
+    @property
+    def data_type_id(self):
+        return self._data_type
+
+    @property
+    def element_size_bits(self):
+        return self._element_size_bits
+
+    @property
+    def item_size(self):
+        self._refresh_view_if_needed()
+        return self._view.item_size

Review Comment:
   Is the difference between `element_size_bits` and `item_size` just bits vs. 
bytes? If so, we might want a more consistent naming scheme (item vs. element).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add array creation/building from buffers [arrow-nanoarrow]

Reply via email to