This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 09481518 feat(python): Create string/binary arrays from iterables
(#430)
09481518 is described below
commit 094815189e43e4bb37bafef8dab124ed1533066b
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Apr 16 12:50:54 2024 -0300
feat(python): Create string/binary arrays from iterables (#430)
This PR adds support for building string and binary arrays from iterables.
It also cleans up a few parts of #426 that resulted in the wheel builds
failing for (at least) PyPy 3.8 and 3.9. We can circle back to the
performance of building from iterables (and whether or not `pack_into()`
is essential) when all the wheels are building reliably.
```python
import nanoarrow as na
strings = ["pizza", "yogurt", "noodles", "peanut butter sandwiches"]
na.Array(strings, na.string())
#> nanoarrow.Array<string>[4]
#> 'pizza'
#> 'yogurt'
#> 'noodles'
#> 'peanut butter sandwiches'
na.Array((s.encode() for s in strings), na.binary())
#> nanoarrow.Array<binary>[4]
#> b'pizza'
#> b'yogurt'
#> b'noodles'
#> b'peanut butter sandwiches'
```
The "build from iterable" code is now sufficiently complicated that it
should be separated out. I did an initial attempt at that for this PR;
however, it scrambles things up a bit and is complicated by the
interdependence between the functions that sanitize arguments (e.g.,
`c_schema()`, `c_array()`) and the functions that build from iterable.
Currently faster than pyarrow for strings (with or without nulls) and for
bytes without nulls, and slightly slower than pyarrow for bytes with nulls:
```python
from itertools import cycle, islice
import nanoarrow as na
import pyarrow as pa
strings = ["pizza", "yogurt", "noodles", "peanut butter sandwiches"]
binary = [s.encode() for s in strings]
def many_strings():
    return islice(cycle(strings), int(1e6))
def many_strings_with_nulls():
    return islice(cycle(strings + [None]), int(1e6))
def many_bytes():
    return islice(cycle(binary), int(1e6))
def many_bytes_with_nulls():
    return islice(cycle(binary + [None]), int(1e6))
%timeit pa.array(many_strings(), pa.string())
#> 23.4 ms ± 488 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit na.c_array(many_strings(), na.string())
#> 14.3 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit pa.array(many_strings_with_nulls(), pa.string())
#> 21.4 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit na.c_array(many_strings_with_nulls(), na.string())
#> 17.1 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit pa.array(many_bytes(), pa.binary())
#> 19.7 ms ± 283 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit na.c_array(many_bytes(), na.binary())
#> 16.3 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit pa.array(many_bytes_with_nulls(), pa.binary())
#> 17.6 ms ± 37.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit na.c_array(many_bytes_with_nulls(), na.binary())
#> 19 ms ± 378 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
---
python/bootstrap.py | 2 +-
python/src/nanoarrow/_lib.pyx | 65 ++++++++++++++++++++++++++++++++++---------
python/src/nanoarrow/c_lib.py | 24 +++++++++++++++-
python/tests/test_c_array.py | 40 +++++++++++++++++++++++++-
python/tests/test_c_buffer.py | 19 +++++++++++--
python/tests/test_iterator.py | 28 +++----------------
6 files changed, 135 insertions(+), 43 deletions(-)
diff --git a/python/bootstrap.py b/python/bootstrap.py
index 6395d128..f80c397c 100644
--- a/python/bootstrap.py
+++ b/python/bootstrap.py
@@ -23,7 +23,7 @@ import tempfile
import warnings
-# Generate the nanoarrow_c.pxd file used by the Cython extension
+# Generate the nanoarrow_c.pxd file used by the Cython extensions
class NanoarrowPxdGenerator:
def __init__(self):
self._define_regexes()
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 463f07ce..e215f01a 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -36,6 +36,7 @@ from libc.string cimport memcpy
from libc.stdio cimport snprintf
from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer,
PyCapsule_IsValid
+from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
Py_buffer,
PyObject_CheckBuffer,
@@ -1995,7 +1996,8 @@ cdef class CBufferBuilder:
cdef int code
struct_obj = Struct(self._buffer._format)
- pack_into = struct_obj.pack_into
+ pack = struct_obj.pack
+ write = self.write
# If an object has a length, we can avoid extra allocations
if hasattr(obj, "__len__"):
@@ -2009,22 +2011,12 @@ cdef class CBufferBuilder:
if self._buffer._data_type in (NANOARROW_TYPE_INTERVAL_DAY_TIME,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO):
for item in obj:
- code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element)
- if code != NANOARROW_OK:
- Error.raise_error("ArrowBufferReserve()", code)
-
- pack_into(self, self._buffer._ptr.size_bytes, *item)
- self._buffer._ptr.size_bytes += bytes_per_element
+ write(pack(*item))
n_values += 1
else:
for item in obj:
- code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element)
- if code != NANOARROW_OK:
- Error.raise_error("ArrowBufferReserve()", code)
-
- pack_into(self, self._buffer._ptr.size_bytes, item)
- self._buffer._ptr.size_bytes += bytes_per_element
+ write(pack(item))
n_values += 1
return n_values
@@ -2215,6 +2207,53 @@ cdef class CArrayBuilder:
Error.raise_error_not_ok("ArrowArrayStartAppending()", code)
return self
+ def append_strings(self, obj):
+ cdef int code
+ cdef Py_ssize_t item_utf8_size
+ cdef ArrowStringView item
+
+ for py_item in obj:
+ if py_item is None:
+ code = ArrowArrayAppendNull(self._ptr, 1)
+ else:
+ # Cython raises the error from PyUnicode_AsUTF8AndSize()
+ # in the event that py_item is not a str(); however, we
+ # set item_utf8_size = 0 to be safe.
+ item_utf8_size = 0
+ item.data = PyUnicode_AsUTF8AndSize(py_item, &item_utf8_size)
+ item.size_bytes = item_utf8_size
+ code = ArrowArrayAppendString(self._ptr, item)
+
+ if code != NANOARROW_OK:
+ Error.raise_error(f"append string item {py_item}")
+
+ return self
+
+ def append_bytes(self, obj):
+ cdef Py_buffer buffer
+ cdef ArrowBufferView item
+
+ for py_item in obj:
+ if py_item is None:
+ code = ArrowArrayAppendNull(self._ptr, 1)
+ else:
+ PyObject_GetBuffer(py_item, &buffer, PyBUF_ANY_CONTIGUOUS |
PyBUF_FORMAT)
+
+ if buffer.ndim != 1:
+ raise ValueError("Can't append buffer with dimensions != 1
to binary array")
+
+ if buffer.itemsize != 1:
+ PyBuffer_Release(&buffer)
+ raise ValueError("Can't append buffer with itemsize != 1
to binary array")
+
+ item.data.data = buffer.buf
+ item.size_bytes = buffer.len
+ code = ArrowArrayAppendBytes(self._ptr, item)
+ PyBuffer_Release(&buffer)
+
+ if code != NANOARROW_OK:
+ Error.raise_error(f"append bytes item {py_item}")
+
def set_offset(self, int64_t offset):
self.c_array._assert_valid()
self._ptr.offset = offset
diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py
index 3dd85d29..0cd5a734 100644
--- a/python/src/nanoarrow/c_lib.py
+++ b/python/src/nanoarrow/c_lib.py
@@ -628,6 +628,20 @@ def _c_array_from_iterable(obj, schema=None) -> CArray:
f"Can't create array from iterable for type {schema_view.type}"
)
+ # Handle variable-size binary types (string, binary)
+ if schema_view.type_id in (CArrowType.STRING, CArrowType.LARGE_STRING):
+ builder = CArrayBuilder.allocate()
+ builder.init_from_schema(schema)
+ builder.start_appending()
+ builder.append_strings(obj)
+ return builder.finish()
+ elif schema_view.type_id in (CArrowType.BINARY, CArrowType.LARGE_BINARY):
+ builder = CArrayBuilder.allocate()
+ builder.init_from_schema(schema)
+ builder.start_appending()
+ builder.append_bytes(obj)
+ return builder.finish()
+
# Creating a buffer from an iterable does not handle None values,
# but we can do so here with the NoneAwareWrapperIterator() wrapper.
# This approach is quite a bit slower, so only do it for a nullable
@@ -655,6 +669,14 @@ def _c_array_from_iterable(obj, schema=None) -> CArray:
def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
import array
+ # array.typecodes is not available in all PyPy versions.
+ # Rather than guess, just don't use the array constructor if
+ # this attribute is not available.
+ if hasattr(array, "typecodes"):
+ array_typecodes = array.typecodes
+ else:
+ array_typecodes = []
+
if schema is None:
raise ValueError("CBuffer from iterable requires schema")
@@ -674,7 +696,7 @@ def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
# If we are using a typecode supported by the array module, it has much
# faster implementations of safely building buffers from iterables
if (
- builder.format in array.typecodes
+ builder.format in array_typecodes
and schema_view.storage_type_id != CArrowType.BOOL
):
buf = array.array(builder.format, obj)
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index 1511196f..91a7af98 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -207,7 +207,7 @@ def test_c_array_from_pybuffer_numpy():
def test_c_array_from_iterable_empty():
- empty_string = na.c_array([], na.c_schema(na.string()))
+ empty_string = na.c_array([], na.string())
assert empty_string.length == 0
assert empty_string.null_count == 0
assert empty_string.offset == 0
@@ -219,6 +219,44 @@ def test_c_array_from_iterable_empty():
assert len(array_view.buffer(2)) == 0
+def test_c_array_from_iterable_string():
+ string = na.c_array(["abc", None, "defg"], na.string())
+ assert string.length == 3
+ assert string.null_count == 1
+
+ array_view = na.c_array_view(string)
+ assert len(array_view.buffer(0)) == 1
+ assert len(array_view.buffer(1)) == 4
+ assert len(array_view.buffer(2)) == 7
+
+ # Check an item that is not a str()
+ with pytest.raises(TypeError):
+ na.c_array([b"1234"], na.string())
+
+
+def test_c_array_from_iterable_bytes():
+ string = na.c_array([b"abc", None, b"defg"], na.binary())
+ assert string.length == 3
+ assert string.null_count == 1
+
+ array_view = na.c_array_view(string)
+ assert len(array_view.buffer(0)) == 1
+ assert len(array_view.buffer(1)) == 4
+ assert len(array_view.buffer(2)) == 7
+
+ with pytest.raises(TypeError):
+ na.c_array(["1234"], na.binary())
+
+ buf_not_bytes = na.c_buffer([1, 2, 3], na.int32())
+ with pytest.raises(ValueError, match="Can't append buffer with itemsize !=
1"):
+ na.c_array([buf_not_bytes], na.binary())
+
+ np = pytest.importorskip("numpy")
+ buf_2d = np.ones((2, 2))
+ with pytest.raises(ValueError, match="Can't append buffer with dimensions
!= 1"):
+ na.c_array([buf_2d], na.binary())
+
+
def test_c_array_from_iterable_non_empty_nullable_without_nulls():
c_array = na.c_array([1, 2, 3], na.int32())
assert c_array.length == 3
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 38bb0c62..0c79de17 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -220,18 +220,31 @@ def test_c_buffer_builder():
with pytest.raises(IndexError):
builder.advance(114)
+
+def test_c_buffer_builder_buffer_protocol():
+ import platform
+
+ builder = CBufferBuilder()
+ builder.reserve_bytes(1)
+
mv = memoryview(builder)
+ assert len(mv) == 1
+
with pytest.raises(BufferError, match="CBufferBuilder is locked"):
memoryview(builder)
with pytest.raises(BufferError, match="CBufferBuilder is locked"):
assert bytes(builder.finish()) == b"abcdefghij"
+ # On at least some versions of PyPy the call to mv.release() does not seem
+ # to deterministically call the CBufferBuilder's __releasebuffer__().
+ if platform.python_implementation() == "PyPy":
+ pytest.skip("CBufferBuilder buffer release is non-deterministic on
PyPy")
+
mv[builder.size_bytes] = ord("k")
builder.advance(1)
-
- del mv
- assert bytes(builder.finish()) == b"abcdefghijk"
+ mv.release()
+ assert bytes(builder.finish()) == b"k"
def test_c_buffer_from_iterable():
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 5b9c8103..0d07d31d 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -58,9 +58,7 @@ def test_iterator_nullable_primitive():
def test_iterator_string():
- array = na.c_array_from_buffers(
- na.string(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()),
b"abcde"]
- )
+ array = na.c_array(["ab", "cde"], na.string())
assert list(iter_py(array)) == ["ab", "cde"]
@@ -69,15 +67,7 @@ def test_iterator_string():
def test_iterator_nullable_string():
- array = na.c_array_from_buffers(
- na.string(),
- 3,
- buffers=[
- na.c_buffer([1, 1, 0], na.bool()),
- na.c_buffer([0, 2, 5, 5], na.int32()),
- b"abcde",
- ],
- )
+ array = na.c_array(["ab", "cde", None], na.string())
assert list(iter_py(array)) == ["ab", "cde", None]
@@ -86,9 +76,7 @@ def test_iterator_nullable_string():
def test_iterator_binary():
- array = na.c_array_from_buffers(
- na.binary(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()),
b"abcde"]
- )
+ array = na.c_array([b"ab", b"cde"], na.binary())
assert list(iter_py(array)) == [b"ab", b"cde"]
@@ -97,15 +85,7 @@ def test_iterator_binary():
def test_iterator_nullable_binary():
- array = na.c_array_from_buffers(
- na.binary(),
- 3,
- buffers=[
- na.c_buffer([1, 1, 0], na.bool()),
- na.c_buffer([0, 2, 5, 5], na.int32()),
- b"abcde",
- ],
- )
+ array = na.c_array([b"ab", b"cde", None], na.binary())
assert list(iter_py(array)) == [b"ab", b"cde", None]