(arrow-nanoarrow) branch main updated: feat(python): Create string/binary arrays from iterables (#430)

paleolimbot Tue, 16 Apr 2024 08:51:09 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 09481518 feat(python): Create string/binary arrays from iterables 
(#430)
09481518 is described below

commit 094815189e43e4bb37bafef8dab124ed1533066b
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Apr 16 12:50:54 2024 -0300

    feat(python): Create string/binary arrays from iterables (#430)
    
    This PR adds support for building string and binary arrays from iterables.
    
    It also cleans up a few parts of #426 that resulted in the wheel builds
    failing for (at least) PyPy 3.8 and 3.9. We can circle back to the
    performance of building from iterables (and whether or not `pack_into()`
    is essential) when all the wheels are building reliably.
    
    ```python
    import nanoarrow as na
    
    strings = ["pizza", "yogurt", "noodles", "peanut butter sandwiches"]
    
    na.Array(strings, na.string())
    #> nanoarrow.Array<string>[4]
    #> 'pizza'
    #> 'yogurt'
    #> 'noodles'
    #> 'peanut butter sandwiches'
    
    na.Array((s.encode() for s in strings), na.binary())
    #> nanoarrow.Array<binary>[4]
    #> b'pizza'
    #> b'yogurt'
    #> b'noodles'
    #> b'peanut butter sandwiches'
    ```
    
    The "build from iterable" code is now sufficiently complicated that it
    should be separated out. I made an initial attempt at that for this PR;
    however, it scrambled things up a bit and was complicated by the
    interdependence between the functions that sanitize arguments (e.g.,
    `c_schema()`, `c_array()`) and the functions that build from iterable.
    
    Currently faster than pyarrow for strings (with or without nulls) and for
    bytes without nulls; slightly slower than pyarrow for bytes with nulls.
    
    ```python
    from itertools import cycle, islice
    import nanoarrow as na
    import pyarrow as pa
    
    strings = ["pizza", "yogurt", "noodles", "peanut butter sandwiches"]
    binary = [s.encode() for s in strings]
    
    def many_strings():
        return islice(cycle(strings), int(1e6))
    
    def many_strings_with_nulls():
        return islice(cycle(strings + [None]), int(1e6))
    
    def many_bytes():
        return islice(cycle(binary), int(1e6))
    
    def many_bytes_with_nulls():
        return islice(cycle(binary + [None]), int(1e6))
    
    %timeit pa.array(many_strings(), pa.string())
    #> 23.4 ms ± 488 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    %timeit na.c_array(many_strings(), na.string())
    #> 14.3 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    
    %timeit pa.array(many_strings_with_nulls(), pa.string())
    #> 21.4 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    %timeit na.c_array(many_strings_with_nulls(), na.string())
    #> 17.1 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    
    %timeit pa.array(many_bytes(), pa.binary())
    #> 19.7 ms ± 283 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    %timeit na.c_array(many_bytes(), na.binary())
    #> 16.3 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    
    %timeit pa.array(many_bytes_with_nulls(), pa.binary())
    #> 17.6 ms ± 37.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit na.c_array(many_bytes_with_nulls(), na.binary())
    #> 19 ms ± 378 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    ```
---
 python/bootstrap.py           |  2 +-
 python/src/nanoarrow/_lib.pyx | 65 ++++++++++++++++++++++++++++++++++---------
 python/src/nanoarrow/c_lib.py | 24 +++++++++++++++-
 python/tests/test_c_array.py  | 40 +++++++++++++++++++++++++-
 python/tests/test_c_buffer.py | 19 +++++++++++--
 python/tests/test_iterator.py | 28 +++----------------
 6 files changed, 135 insertions(+), 43 deletions(-)

diff --git a/python/bootstrap.py b/python/bootstrap.py
index 6395d128..f80c397c 100644
--- a/python/bootstrap.py
+++ b/python/bootstrap.py
@@ -23,7 +23,7 @@ import tempfile
 import warnings
 
 
-# Generate the nanoarrow_c.pxd file used by the Cython extension
+# Generate the nanoarrow_c.pxd file used by the Cython extensions
 class NanoarrowPxdGenerator:
     def __init__(self):
         self._define_regexes()
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 463f07ce..e215f01a 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -36,6 +36,7 @@ from libc.string cimport memcpy
 from libc.stdio cimport snprintf
 from cpython.bytes cimport PyBytes_FromStringAndSize
 from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer, 
PyCapsule_IsValid
+from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 from cpython cimport (
     Py_buffer,
     PyObject_CheckBuffer,
@@ -1995,7 +1996,8 @@ cdef class CBufferBuilder:
         cdef int code
 
         struct_obj = Struct(self._buffer._format)
-        pack_into = struct_obj.pack_into
+        pack = struct_obj.pack
+        write = self.write
 
         # If an object has a length, we can avoid extra allocations
         if hasattr(obj, "__len__"):
@@ -2009,22 +2011,12 @@ cdef class CBufferBuilder:
         if self._buffer._data_type in (NANOARROW_TYPE_INTERVAL_DAY_TIME,
                                        NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO):
             for item in obj:
-                code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element)
-                if code != NANOARROW_OK:
-                    Error.raise_error("ArrowBufferReserve()", code)
-
-                pack_into(self, self._buffer._ptr.size_bytes, *item)
-                self._buffer._ptr.size_bytes += bytes_per_element
+                write(pack(*item))
                 n_values += 1
 
         else:
             for item in obj:
-                code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element)
-                if code != NANOARROW_OK:
-                    Error.raise_error("ArrowBufferReserve()", code)
-
-                pack_into(self, self._buffer._ptr.size_bytes, item)
-                self._buffer._ptr.size_bytes += bytes_per_element
+                write(pack(item))
                 n_values += 1
 
         return n_values
@@ -2215,6 +2207,53 @@ cdef class CArrayBuilder:
         Error.raise_error_not_ok("ArrowArrayStartAppending()", code)
         return self
 
+    def append_strings(self, obj):
+        cdef int code
+        cdef Py_ssize_t item_utf8_size
+        cdef ArrowStringView item
+
+        for py_item in obj:
+            if py_item is None:
+                code = ArrowArrayAppendNull(self._ptr, 1)
+            else:
+                # Cython raises the error from PyUnicode_AsUTF8AndSize()
+                # in the event that py_item is not a str(); however, we
+                # set item_utf8_size = 0 to be safe.
+                item_utf8_size = 0
+                item.data = PyUnicode_AsUTF8AndSize(py_item, &item_utf8_size)
+                item.size_bytes = item_utf8_size
+                code = ArrowArrayAppendString(self._ptr, item)
+
+            if code != NANOARROW_OK:
+                Error.raise_error(f"append string item {py_item}")
+
+        return self
+
+    def append_bytes(self, obj):
+        cdef Py_buffer buffer
+        cdef ArrowBufferView item
+
+        for py_item in obj:
+            if py_item is None:
+                code = ArrowArrayAppendNull(self._ptr, 1)
+            else:
+                PyObject_GetBuffer(py_item, &buffer, PyBUF_ANY_CONTIGUOUS | 
PyBUF_FORMAT)
+
+                if buffer.ndim != 1:
+                    raise ValueError("Can't append buffer with dimensions != 1 
to binary array")
+
+                if buffer.itemsize != 1:
+                    PyBuffer_Release(&buffer)
+                    raise ValueError("Can't append buffer with itemsize != 1 
to binary array")
+
+                item.data.data = buffer.buf
+                item.size_bytes = buffer.len
+                code = ArrowArrayAppendBytes(self._ptr, item)
+                PyBuffer_Release(&buffer)
+
+            if code != NANOARROW_OK:
+                Error.raise_error(f"append bytes item {py_item}")
+
     def set_offset(self, int64_t offset):
         self.c_array._assert_valid()
         self._ptr.offset = offset
diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py
index 3dd85d29..0cd5a734 100644
--- a/python/src/nanoarrow/c_lib.py
+++ b/python/src/nanoarrow/c_lib.py
@@ -628,6 +628,20 @@ def _c_array_from_iterable(obj, schema=None) -> CArray:
             f"Can't create array from iterable for type {schema_view.type}"
         )
 
+    # Handle variable-size binary types (string, binary)
+    if schema_view.type_id in (CArrowType.STRING, CArrowType.LARGE_STRING):
+        builder = CArrayBuilder.allocate()
+        builder.init_from_schema(schema)
+        builder.start_appending()
+        builder.append_strings(obj)
+        return builder.finish()
+    elif schema_view.type_id in (CArrowType.BINARY, CArrowType.LARGE_BINARY):
+        builder = CArrayBuilder.allocate()
+        builder.init_from_schema(schema)
+        builder.start_appending()
+        builder.append_bytes(obj)
+        return builder.finish()
+
     # Creating a buffer from an iterable does not handle None values,
     # but we can do so here with the NoneAwareWrapperIterator() wrapper.
     # This approach is quite a bit slower, so only do it for a nullable
@@ -655,6 +669,14 @@ def _c_array_from_iterable(obj, schema=None) -> CArray:
 def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
     import array
 
+    # array.typecodes is not available in all PyPy versions.
+    # Rather than guess, just don't use the array constructor if
+    # this attribute is not available.
+    if hasattr(array, "typecodes"):
+        array_typecodes = array.typecodes
+    else:
+        array_typecodes = []
+
     if schema is None:
         raise ValueError("CBuffer from iterable requires schema")
 
@@ -674,7 +696,7 @@ def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
     # If we are using a typecode supported by the array module, it has much
     # faster implementations of safely building buffers from iterables
     if (
-        builder.format in array.typecodes
+        builder.format in array_typecodes
         and schema_view.storage_type_id != CArrowType.BOOL
     ):
         buf = array.array(builder.format, obj)
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index 1511196f..91a7af98 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -207,7 +207,7 @@ def test_c_array_from_pybuffer_numpy():
 
 
 def test_c_array_from_iterable_empty():
-    empty_string = na.c_array([], na.c_schema(na.string()))
+    empty_string = na.c_array([], na.string())
     assert empty_string.length == 0
     assert empty_string.null_count == 0
     assert empty_string.offset == 0
@@ -219,6 +219,44 @@ def test_c_array_from_iterable_empty():
     assert len(array_view.buffer(2)) == 0
 
 
+def test_c_array_from_iterable_string():
+    string = na.c_array(["abc", None, "defg"], na.string())
+    assert string.length == 3
+    assert string.null_count == 1
+
+    array_view = na.c_array_view(string)
+    assert len(array_view.buffer(0)) == 1
+    assert len(array_view.buffer(1)) == 4
+    assert len(array_view.buffer(2)) == 7
+
+    # Check an item that is not a str()
+    with pytest.raises(TypeError):
+        na.c_array([b"1234"], na.string())
+
+
+def test_c_array_from_iterable_bytes():
+    string = na.c_array([b"abc", None, b"defg"], na.binary())
+    assert string.length == 3
+    assert string.null_count == 1
+
+    array_view = na.c_array_view(string)
+    assert len(array_view.buffer(0)) == 1
+    assert len(array_view.buffer(1)) == 4
+    assert len(array_view.buffer(2)) == 7
+
+    with pytest.raises(TypeError):
+        na.c_array(["1234"], na.binary())
+
+    buf_not_bytes = na.c_buffer([1, 2, 3], na.int32())
+    with pytest.raises(ValueError, match="Can't append buffer with itemsize != 
1"):
+        na.c_array([buf_not_bytes], na.binary())
+
+    np = pytest.importorskip("numpy")
+    buf_2d = np.ones((2, 2))
+    with pytest.raises(ValueError, match="Can't append buffer with dimensions 
!= 1"):
+        na.c_array([buf_2d], na.binary())
+
+
 def test_c_array_from_iterable_non_empty_nullable_without_nulls():
     c_array = na.c_array([1, 2, 3], na.int32())
     assert c_array.length == 3
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 38bb0c62..0c79de17 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -220,18 +220,31 @@ def test_c_buffer_builder():
     with pytest.raises(IndexError):
         builder.advance(114)
 
+
+def test_c_buffer_builder_buffer_protocol():
+    import platform
+
+    builder = CBufferBuilder()
+    builder.reserve_bytes(1)
+
     mv = memoryview(builder)
+    assert len(mv) == 1
+
     with pytest.raises(BufferError, match="CBufferBuilder is locked"):
         memoryview(builder)
 
     with pytest.raises(BufferError, match="CBufferBuilder is locked"):
         assert bytes(builder.finish()) == b"abcdefghij"
 
+    # On at least some versions of PyPy the call to mv.release() does not seem
+    # to deterministically call the CBufferBuilder's __releasebuffer__().
+    if platform.python_implementation() == "PyPy":
+        pytest.skip("CBufferBuilder buffer release is non-deterministic on 
PyPy")
+
     mv[builder.size_bytes] = ord("k")
     builder.advance(1)
-
-    del mv
-    assert bytes(builder.finish()) == b"abcdefghijk"
+    mv.release()
+    assert bytes(builder.finish()) == b"k"
 
 
 def test_c_buffer_from_iterable():
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 5b9c8103..0d07d31d 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -58,9 +58,7 @@ def test_iterator_nullable_primitive():
 
 
 def test_iterator_string():
-    array = na.c_array_from_buffers(
-        na.string(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()), 
b"abcde"]
-    )
+    array = na.c_array(["ab", "cde"], na.string())
 
     assert list(iter_py(array)) == ["ab", "cde"]
 
@@ -69,15 +67,7 @@ def test_iterator_string():
 
 
 def test_iterator_nullable_string():
-    array = na.c_array_from_buffers(
-        na.string(),
-        3,
-        buffers=[
-            na.c_buffer([1, 1, 0], na.bool()),
-            na.c_buffer([0, 2, 5, 5], na.int32()),
-            b"abcde",
-        ],
-    )
+    array = na.c_array(["ab", "cde", None], na.string())
 
     assert list(iter_py(array)) == ["ab", "cde", None]
 
@@ -86,9 +76,7 @@ def test_iterator_nullable_string():
 
 
 def test_iterator_binary():
-    array = na.c_array_from_buffers(
-        na.binary(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()), 
b"abcde"]
-    )
+    array = na.c_array([b"ab", b"cde"], na.binary())
 
     assert list(iter_py(array)) == [b"ab", b"cde"]
 
@@ -97,15 +85,7 @@ def test_iterator_binary():
 
 
 def test_iterator_nullable_binary():
-    array = na.c_array_from_buffers(
-        na.binary(),
-        3,
-        buffers=[
-            na.c_buffer([1, 1, 0], na.bool()),
-            na.c_buffer([0, 2, 5, 5], na.int32()),
-            b"abcde",
-        ],
-    )
+    array = na.c_array([b"ab", b"cde", None], na.binary())
 
     assert list(iter_py(array)) == [b"ab", b"cde", None]

(arrow-nanoarrow) branch main updated: feat(python): Create string/binary arrays from iterables (#430)

Reply via email to