This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 72a2e67b refactor(python): Document, prefix, and add reprs for
C-wrapping classes (#340)
72a2e67b is described below
commit 72a2e67bb3476a926026f128b622e89881204ae9
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Jan 11 19:17:24 2024 +0000
refactor(python): Document, prefix, and add reprs for C-wrapping classes
(#340)
This PR was inspired by #319 but only addresses the first half (prefixing
C-wrapping classes so that the name `nanoarrow.array()` can be used for
a future class/constructor that more closely resembles a `pyarrow.Array`
or `numpy.Array`).
This PR does a few things:
- Uses capsules to manage allocation/cleanup of C resources instead of
"holder" objects. This eliminated some code and in theory makes it
possible to move some pieces out of Cython into C.
- Renames any "nanoarrow C library binding" classes to start with `C`
(e.g., `Schema` to `CSchema`). I made them slightly more literal as
well. Basically, these classes are about accessing the fields of the
structure without segfaulting. In a potential future world where we
don't use Cython, this is something like what we'd get with
auto-generated wrapper classes or thin C++ wrappers with generated
binding code.
- Opens the door for the user-facing versions of these: `Array`,
`Schema`, and `ArrayStream`. The scope and design of those require
more iteration than this PR allows and would benefit from some other
infrastructure being in place first (e.g., conversion to/from Python).
To make it a little more clear what the existing structures actually are
and what they can do, I added `repr()`s for them and updated the README.
Briefly:
```python
import nanoarrow as na
import pyarrow as pa
na.cschema(pa.int32())
#> <nanoarrow.clib.CSchema int32>
#> - format: 'i'
#> - name: ''
#> - flags: 2
#> - metadata: NULL
#> - dictionary: NULL
#> - children[0]:
na.cschema_view(pa.timestamp('s', "America/Halifax"))
#> <nanoarrow.clib.CSchemaView>
#> - type: 'timestamp'
#> - storage_type: 'int64'
#> - time_unit: 's'
#> - timezone: 'America/Halifax'
na.carray(pa.array([1, 2, 3]))
#> <nanoarrow.clib.CArray int64>
#> - length: 3
#> - offset: 0
#> - null_count: 0
#> - buffers: (0, 3354772373824)
#> - dictionary: NULL
#> - children[0]:
na.carray_view(pa.array([1, 2, 3]))
#> <nanoarrow.clib.CArrayView>
#> - storage_type: 'int64'
#> - length: 3
#> - offset: 0
#> - null_count: 0
#> - buffers[2]:
#> - <bool validity[0 b] >
#> - <int64 data[24 b] 1 2 3>
#> - dictionary: NULL
#> - children[0]:
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
na.carray_stream(reader)
#> <nanoarrow.clib.CArrayStream>
#> - get_schema(): struct<some_column: int32>
```
This involved fixing the existing `BufferView`, since its elements had to
be accessed in order to print its contents in a repr-friendly way. I think
the `BufferView` will see some changes, but it does seem relatively
performant:
```python
import pyarrow as pa
import nanoarrow as na
import numpy as np
n = int(1e6)
pa_array = pa.array(np.random.random(n))
na_array_view = na.carray_view(pa_array)
%timeit pa_array.to_pylist()
#> 169 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit list(na_array_view.buffer(1))
#> 33.8 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
```
---------
Co-authored-by: Dane Pitkin <[email protected]>
---
.github/workflows/python.yaml | 3 +-
python/src/nanoarrow/__init__.py => .isort.cfg | 9 +-
python/README.ipynb | 236 ++++---
python/README.md | 175 +++--
python/src/nanoarrow/__init__.py | 15 +-
python/src/nanoarrow/_lib.pyx | 883 +++++++++++--------------
python/src/nanoarrow/_lib_utils.py | 149 ++++-
python/src/nanoarrow/c_lib.py | 307 +++++++++
python/src/nanoarrow/device.py | 10 +-
python/src/nanoarrow/lib.py | 82 ---
python/tests/test_capsules.py | 43 +-
python/tests/test_device.py | 4 +-
python/tests/test_nanoarrow.py | 380 +++++++----
13 files changed, 1437 insertions(+), 859 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 2f21379f..85f12c2e 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -61,8 +61,7 @@ jobs:
- name: Run doctests
if: success() && matrix.python-version == '3.10'
run: |
- pip install pytest-cython
- pytest --pyargs nanoarrow --doctest-cython
+ pytest --pyargs nanoarrow --doctest-modules
- name: Coverage
if: success() && matrix.python-version == '3.10'
diff --git a/python/src/nanoarrow/__init__.py b/.isort.cfg
similarity index 79%
copy from python/src/nanoarrow/__init__.py
copy to .isort.cfg
index 789f4597..614bf992 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/.isort.cfg
@@ -15,6 +15,9 @@
# specific language governing permissions and limitations
# under the License.
-from ._lib import Array, ArrayStream, ArrayView, Schema, c_version # noqa:
F401
-from .lib import array, array_stream, schema, array_view # noqa: F401
-from ._version import __version__ # noqa: F401
+[settings]
+multi_line_output = 3
+include_trailing_comma = True
+force_grid_wrap = 0
+use_parentheses = True
+line_length = 88
diff --git a/python/README.ipynb b/python/README.ipynb
index d89d4c4a..153e1cbf 100644
--- a/python/README.ipynb
+++ b/python/README.ipynb
@@ -40,7 +40,7 @@
"URL (requires a C compiler):\n",
"\n",
"```bash\n",
- "python -m pip install
\"https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python\"\n",
+ "python -m pip install
\"git+https://github.com/apache/arrow-nanoarrow.git#egg=nanoarrow&subdirectory=python\"\n",
"```\n",
"\n",
"If you can import the namespace, you're good to go!"
@@ -60,23 +60,41 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Example\n",
+ "## Low-level C library bindings\n",
"\n",
- "The Arrow C Data and Arrow C Stream interfaces are comprised of three
structures: the `ArrowSchema` which represents a data type of an array, the
`ArrowArray` which represents the values of an array, and an
`ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common
`ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow
Python package.\n",
+ "The Arrow C Data and Arrow C Stream interfaces are comprised of three
structures: the `ArrowSchema` which represents a data type of an array, the
`ArrowArray` which represents the values of an array, and an
`ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common
`ArrowSchema`.\n",
"\n",
"### Schemas\n",
"\n",
- "Use `nanoarrow.schema()` to convert a data type-like object to an
`ArrowSchema`. This is currently only implemented for pyarrow objects."
+ "Use `nanoarrow.c_schema()` to convert an object to an `ArrowSchema` and
wrap it as a Python object. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.Schema`, `pyarrow.DataType`, and `pyarrow.Field`)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<nanoarrow.c_lib.CSchema decimal128(10, 3)>\n",
+ "- format: 'd:10,3'\n",
+ "- name: ''\n",
+ "- flags: 2\n",
+ "- metadata: NULL\n",
+ "- dictionary: NULL\n",
+ "- children[0]:"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"import pyarrow as pa\n",
- "schema = na.schema(pa.decimal128(10, 3))"
+ "schema = na.c_schema(pa.decimal128(10, 3))\n",
+ "schema"
]
},
{
@@ -84,7 +102,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can extract the fields of a `Schema` object one at a time or parse it
into a view to extract deserialized parameters."
+ "You can extract the fields of a `CSchema` object one at a time or parse
it into a view to extract deserialized parameters."
]
},
{
@@ -93,27 +111,30 @@
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "d:10,3\n",
- "10\n",
- "3\n"
- ]
+ "data": {
+ "text/plain": [
+ "<nanoarrow.c_lib.CSchemaView>\n",
+ "- type: 'decimal128'\n",
+ "- storage_type: 'decimal128'\n",
+ "- decimal_bitwidth: 128\n",
+ "- decimal_precision: 10\n",
+ "- decimal_scale: 3"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "print(schema.format)\n",
- "print(schema.view().decimal_precision)\n",
- "print(schema.view().decimal_scale)"
+ "na.c_schema_view(schema)"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "The `nanoarrow.schema()` helper is currently only implemented for pyarrow
objects. If your data type has an `_export_to_c()`-like function, you can get
the address of a freshly-allocated `ArrowSchema` as well:"
+ "Advanced users can allocate an empty `CSchema` and populate its contents
by passing its `._addr()` to a schema-exporting function."
]
},
{
@@ -124,7 +145,13 @@
{
"data": {
"text/plain": [
- "'int32'"
+ "<nanoarrow.c_lib.CSchema int32>\n",
+ "- format: 'i'\n",
+ "- name: ''\n",
+ "- flags: 2\n",
+ "- metadata: NULL\n",
+ "- dictionary: NULL\n",
+ "- children[0]:"
]
},
"execution_count": 4,
@@ -133,9 +160,9 @@
}
],
"source": [
- "schema = na.Schema.allocate()\n",
+ "schema = na.allocate_c_schema()\n",
"pa.int32()._export_to_c(schema._addr())\n",
- "schema.view().type"
+ "schema"
]
},
{
@@ -143,7 +170,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The `Schema` object cleans up after itself: when the object is deleted,
the underlying `Schema` is released."
+ "The `CSchema` object cleans up after itself: when the object is deleted,
the underlying `ArrowSchema` is released."
]
},
{
@@ -153,43 +180,34 @@
"source": [
"### Arrays\n",
"\n",
- "You can use `nanoarrow.array()` to convert an array-like object to a
`nanoarrow.Array`, optionally attaching a `Schema` that can be used to
interpret its contents. This is currently only implemented for pyarrow objects."
+ "You can use `nanoarrow.c_array()` to convert an array-like object to an
`ArrowArray`, wrap it as a Python object, and attach a schema that can be used
to interpret its contents. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.Array`, `pyarrow.RecordBatch`)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
- "outputs": [],
- "source": [
- "array = na.array(pa.array([\"one\", \"two\", \"three\", None]))"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Like the `Schema`, you can inspect an `Array` by extracting fields
individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "4\n",
- "1\n"
- ]
+ "data": {
+ "text/plain": [
+ "<nanoarrow.c_lib.CArray string>\n",
+ "- length: 4\n",
+ "- offset: 0\n",
+ "- null_count: 1\n",
+ "- buffers: (2939032895680, 2939032895616, 2939032895744)\n",
+ "- dictionary: NULL\n",
+ "- children[0]:"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "print(array.length)\n",
- "print(array.null_count)"
+ "array = na.c_array(pa.array([\"one\", \"two\", \"three\", None]))\n",
+ "array"
]
},
{
@@ -197,32 +215,37 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "...and parse the `Array`/`Schema` combination into a view whose contents
is more readily accessible."
+ "You can extract the fields of a `CArray` one at a time or parse it into a
view to extract deserialized content:"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "[array([7], dtype=uint8),\n",
- " array([ 0, 3, 6, 11, 11], dtype=int32),\n",
- " array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e',
b'e'],\n",
- " dtype='|S1')]"
+ "<nanoarrow.c_lib.CArrayView>\n",
+ "- storage_type: 'string'\n",
+ "- length: 4\n",
+ "- offset: 0\n",
+ "- null_count: 1\n",
+ "- buffers[3]:\n",
+ " - <bool validity[1 b] 11100000>\n",
+ " - <int32 data_offset[20 b] 0 3 6 11 11>\n",
+ " - <string data[11 b] b'onetwothree'>\n",
+ "- dictionary: NULL\n",
+ "- children[0]:"
]
},
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "import numpy as np\n",
- "view = array.view()\n",
- "[np.array(buffer) for buffer in view.buffers]"
+ "na.c_array_view(array)"
]
},
{
@@ -230,12 +253,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Like the `Schema`, you can allocate an empty one and access its address
with `_addr()` to pass to other array-exporting functions."
+ "Like the `CSchema`, you can allocate an empty one and access its address
with `_addr()` to pass to other array-exporting functions."
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -244,13 +267,13 @@
"3"
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "array = na.Array.allocate(na.Schema.allocate())\n",
+ "array = na.allocate_c_array()\n",
"pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())\n",
"array.length"
]
@@ -262,19 +285,45 @@
"source": [
"### Array streams\n",
"\n",
- "You can use `nanoarrow.array_stream()` to convert an object representing
a sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`.
This is currently only implemented for pyarrow objects."
+ "You can use `nanoarrow.c_array_stream()` to wrap an object representing a
sequence of `CArray`s with a common `CSchema` to an `ArrowArrayStream` and wrap
it as a Python object. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.RecordBatchReader`)."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<nanoarrow.c_lib.CArrayStream>\n",
+ "- get_schema(): <nanoarrow.c_lib.CSchema struct>\n",
+ " - format: '+s'\n",
+ " - name: ''\n",
+ " - flags: 0\n",
+ " - metadata: NULL\n",
+ " - dictionary: NULL\n",
+ " - children[1]:\n",
+ " 'some_column': <nanoarrow.c_lib.CSchema int32>\n",
+ " - format: 'i'\n",
+ " - name: 'some_column'\n",
+ " - flags: 2\n",
+ " - metadata: NULL\n",
+ " - dictionary: NULL\n",
+ " - children[0]:"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pa_array_child = pa.array([1, 2, 3], pa.int32())\n",
"pa_array = pa.record_batch([pa_array_child], names=[\"some_column\"])\n",
"reader = pa.RecordBatchReader.from_batches(pa_array.schema,
[pa_array])\n",
- "array_stream = na.array_stream(reader)"
+ "array_stream = na.c_array_stream(reader)\n",
+ "array_stream"
]
},
{
@@ -282,31 +331,38 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can pull the next array from the stream using `.get_next()` or use it
like an interator. The `.get_next()` method will return `None` when there are
no more arrays in the stream."
+ "You can pull the next array from the stream using `.get_next()` or use it
like an iterator. The `.get_next()` method will raise `StopIteration` when
there are no more arrays in the stream."
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "struct<some_column: int32>\n",
- "3\n",
- "True\n"
+ "<nanoarrow.c_lib.CArray struct>\n",
+ "- length: 3\n",
+ "- offset: 0\n",
+ "- null_count: 0\n",
+ "- buffers: (0,)\n",
+ "- dictionary: NULL\n",
+ "- children[1]:\n",
+ " 'some_column': <nanoarrow.c_lib.CArray int32>\n",
+ " - length: 3\n",
+ " - offset: 0\n",
+ " - null_count: 0\n",
+ " - buffers: (0, 2939033026688)\n",
+ " - dictionary: NULL\n",
+ " - children[0]:\n"
]
}
],
"source": [
- "print(array_stream.get_schema())\n",
- "\n",
"for array in array_stream:\n",
- " print(array.length)\n",
- "\n",
- "print(array_stream.get_next() is None)"
+ " print(array)"
]
},
{
@@ -319,24 +375,38 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "struct<some_column: int32>"
+ "<nanoarrow.c_lib.CArrayStream>\n",
+ "- get_schema(): <nanoarrow.c_lib.CSchema struct>\n",
+ " - format: '+s'\n",
+ " - name: ''\n",
+ " - flags: 0\n",
+ " - metadata: NULL\n",
+ " - dictionary: NULL\n",
+ " - children[1]:\n",
+ " 'some_column': <nanoarrow.c_lib.CSchema int32>\n",
+ " - format: 'i'\n",
+ " - name: 'some_column'\n",
+ " - flags: 2\n",
+ " - metadata: NULL\n",
+ " - dictionary: NULL\n",
+ " - children[0]:"
]
},
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "array_stream = na.ArrayStream.allocate()\n",
+ "array_stream = na.allocate_c_array_stream()\n",
"reader._export_to_c(array_stream._addr())\n",
- "array_stream.get_schema()"
+ "array_stream"
]
},
{
@@ -383,7 +453,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.6"
+ "version": "3.11.4"
},
"orig_nbformat": 4
},
diff --git a/python/README.md b/python/README.md
index 2051180f..e5d4fec9 100644
--- a/python/README.md
+++ b/python/README.md
@@ -33,7 +33,7 @@ Python bindings for nanoarrow are not yet available on PyPI.
You can install via
URL (requires a C compiler):
```bash
-python -m pip install
"https://github.com/apache/arrow-nanoarrow/archive/refs/heads/main.zip#egg=nanoarrow&subdirectory=python"
+python -m pip install
"git+https://github.com/apache/arrow-nanoarrow.git#egg=nanoarrow&subdirectory=python"
```
If you can import the namespace, you're good to go!
@@ -43,97 +43,129 @@ If you can import the namespace, you're good to go!
import nanoarrow as na
```
-## Example
+## Low-level C library bindings
-The Arrow C Data and Arrow C Stream interfaces are comprised of three
structures: the `ArrowSchema` which represents a data type of an array, the
`ArrowArray` which represents the values of an array, and an
`ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common
`ArrowSchema`. All three can be wrapped by Python objects using the nanoarrow
Python package.
+The Arrow C Data and Arrow C Stream interfaces are comprised of three
structures: the `ArrowSchema` which represents a data type of an array, the
`ArrowArray` which represents the values of an array, and an
`ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common
`ArrowSchema`.
### Schemas
-Use `nanoarrow.schema()` to convert a data type-like object to an
`ArrowSchema`. This is currently only implemented for pyarrow objects.
+Use `nanoarrow.c_schema()` to convert an object to an `ArrowSchema` and wrap
it as a Python object. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.Schema`, `pyarrow.DataType`, and `pyarrow.Field`).
```python
import pyarrow as pa
-schema = na.schema(pa.decimal128(10, 3))
+schema = na.c_schema(pa.decimal128(10, 3))
+schema
```
-You can extract the fields of a `Schema` object one at a time or parse it into
a view to extract deserialized parameters.
+
+
+
+ <nanoarrow.c_lib.CSchema decimal128(10, 3)>
+ - format: 'd:10,3'
+ - name: ''
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
+
+
+
+You can extract the fields of a `CSchema` object one at a time or parse it
into a view to extract deserialized parameters.
```python
-print(schema.format)
-print(schema.view().decimal_precision)
-print(schema.view().decimal_scale)
+na.c_schema_view(schema)
```
- d:10,3
- 10
- 3
-The `nanoarrow.schema()` helper is currently only implemented for pyarrow
objects. If your data type has an `_export_to_c()`-like function, you can get
the address of a freshly-allocated `ArrowSchema` as well:
+
+ <nanoarrow.c_lib.CSchemaView>
+ - type: 'decimal128'
+ - storage_type: 'decimal128'
+ - decimal_bitwidth: 128
+ - decimal_precision: 10
+ - decimal_scale: 3
+
+
+
+Advanced users can allocate an empty `CSchema` and populate its contents by
passing its `._addr()` to a schema-exporting function.
```python
-schema = na.Schema.allocate()
+schema = na.allocate_c_schema()
pa.int32()._export_to_c(schema._addr())
-schema.view().type
+schema
```
- 'int32'
+ <nanoarrow.c_lib.CSchema int32>
+ - format: 'i'
+ - name: ''
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
-The `Schema` object cleans up after itself: when the object is deleted, the
underlying `Schema` is released.
+The `CSchema` object cleans up after itself: when the object is deleted, the
underlying `ArrowSchema` is released.
### Arrays
-You can use `nanoarrow.array()` to convert an array-like object to a
`nanoarrow.Array`, optionally attaching a `Schema` that can be used to
interpret its contents. This is currently only implemented for pyarrow objects.
+You can use `nanoarrow.c_array()` to convert an array-like object to an
`ArrowArray`, wrap it as a Python object, and attach a schema that can be used
to interpret its contents. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.Array`, `pyarrow.RecordBatch`).
```python
-array = na.array(pa.array(["one", "two", "three", None]))
+array = na.c_array(pa.array(["one", "two", "three", None]))
+array
```
-Like the `Schema`, you can inspect an `Array` by extracting fields
individually:
-```python
-print(array.length)
-print(array.null_count)
-```
- 4
- 1
+ <nanoarrow.c_lib.CArray string>
+ - length: 4
+ - offset: 0
+ - null_count: 1
+ - buffers: (2939032895680, 2939032895616, 2939032895744)
+ - dictionary: NULL
+ - children[0]:
+
-...and parse the `Array`/`Schema` combination into a view whose contents is
more readily accessible.
+You can extract the fields of a `CArray` one at a time or parse it into a view
to extract deserialized content:
```python
-import numpy as np
-view = array.view()
-[np.array(buffer) for buffer in view.buffers]
+na.c_array_view(array)
```
- [array([7], dtype=uint8),
- array([ 0, 3, 6, 11, 11], dtype=int32),
- array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
- dtype='|S1')]
+ <nanoarrow.c_lib.CArrayView>
+ - storage_type: 'string'
+ - length: 4
+ - offset: 0
+ - null_count: 1
+ - buffers[3]:
+ - <bool validity[1 b] 11100000>
+ - <int32 data_offset[20 b] 0 3 6 11 11>
+ - <string data[11 b] b'onetwothree'>
+ - dictionary: NULL
+ - children[0]:
-Like the `Schema`, you can allocate an empty one and access its address with
`_addr()` to pass to other array-exporting functions.
+Like the `CSchema`, you can allocate an empty one and access its address with
`_addr()` to pass to other array-exporting functions.
```python
-array = na.Array.allocate(na.Schema.allocate())
+array = na.allocate_c_array()
pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())
array.length
```
@@ -147,46 +179,89 @@ array.length
### Array streams
-You can use `nanoarrow.array_stream()` to convert an object representing a
sequence of `Array`s with a common `Schema` to a `nanoarrow.ArrayStream`. This
is currently only implemented for pyarrow objects.
+You can use `nanoarrow.c_array_stream()` to wrap an object representing a
sequence of `CArray`s with a common `CSchema` to an `ArrowArrayStream` and wrap
it as a Python object. This works for any object implementing the [Arrow
PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
(e.g., `pyarrow.RecordBatchReader`).
```python
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
-array_stream = na.array_stream(reader)
+array_stream = na.c_array_stream(reader)
+array_stream
```
-You can pull the next array from the stream using `.get_next()` or use it like
an iterator. The `.get_next()` method will return `None` when there are no more
arrays in the stream.
-```python
-print(array_stream.get_schema())
-for array in array_stream:
- print(array.length)
+ <nanoarrow.c_lib.CArrayStream>
+ - get_schema(): <nanoarrow.c_lib.CSchema struct>
+ - format: '+s'
+ - name: ''
+ - flags: 0
+ - metadata: NULL
+ - dictionary: NULL
+ - children[1]:
+ 'some_column': <nanoarrow.c_lib.CSchema int32>
+ - format: 'i'
+ - name: 'some_column'
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
+
+
-print(array_stream.get_next() is None)
+You can pull the next array from the stream using `.get_next()` or use it like
an iterator. The `.get_next()` method will raise `StopIteration` when there are
no more arrays in the stream.
+
+
+```python
+for array in array_stream:
+ print(array)
```
- struct<some_column: int32>
- 3
- True
+ <nanoarrow.c_lib.CArray struct>
+ - length: 3
+ - offset: 0
+ - null_count: 0
+ - buffers: (0,)
+ - dictionary: NULL
+ - children[1]:
+ 'some_column': <nanoarrow.c_lib.CArray int32>
+ - length: 3
+ - offset: 0
+ - null_count: 0
+ - buffers: (0, 2939033026688)
+ - dictionary: NULL
+ - children[0]:
You can also get the address of a freshly-allocated stream to pass to a
suitable exporting function:
```python
-array_stream = na.ArrayStream.allocate()
+array_stream = na.allocate_c_array_stream()
reader._export_to_c(array_stream._addr())
-array_stream.get_schema()
+array_stream
```
- struct<some_column: int32>
+ <nanoarrow.c_lib.CArrayStream>
+ - get_schema(): <nanoarrow.c_lib.CSchema struct>
+ - format: '+s'
+ - name: ''
+ - flags: 0
+ - metadata: NULL
+ - dictionary: NULL
+ - children[1]:
+ 'some_column': <nanoarrow.c_lib.CSchema int32>
+ - format: 'i'
+ - name: 'some_column'
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 789f4597..318493f8 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -15,6 +15,15 @@
# specific language governing permissions and limitations
# under the License.
-from ._lib import Array, ArrayStream, ArrayView, Schema, c_version # noqa:
F401
-from .lib import array, array_stream, schema, array_view # noqa: F401
-from ._version import __version__ # noqa: F401
+from nanoarrow._lib import c_version # noqa: F401
+from nanoarrow.c_lib import ( # noqa: F401
+ c_schema,
+ c_array,
+ c_array_stream,
+ c_schema_view,
+ c_array_view,
+ allocate_c_schema,
+ allocate_c_array,
+ allocate_c_array_stream,
+)
+from nanoarrow._version import __version__ # noqa: F401
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 2e7e6879..f77067c9 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -24,21 +24,25 @@ This Cython extension provides low-level Python wrappers
around the
Arrow C Data and Arrow C Stream interface structs. In general, there
is one wrapper per C struct and pointer validity is managed by keeping
strong references to Python objects. These wrappers are intended to
-be literal and stay close to the structure definitions.
+be literal and stay close to the structure definitions: higher level
+interfaces can and should be built in Python where it is faster to
+iterate and where it is easier to create a better user experience
+by default (i.e., classes, methods, and functions implemented in Python
+generally have better autocomplete + documentation available to IDEs).
"""
from libc.stdint cimport uintptr_t, int64_t
-from libc.stdlib cimport malloc, free
from libc.string cimport memcpy
-from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from libc.stdio cimport snprintf
from cpython.bytes cimport PyBytes_FromStringAndSize
-from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer,
PyCapsule_CheckExact
+from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
from cpython cimport Py_buffer
-from cpython.ref cimport PyObject, Py_INCREF, Py_DECREF
+from cpython.ref cimport Py_INCREF, Py_DECREF
from nanoarrow_c cimport *
from nanoarrow_device_c cimport *
-from nanoarrow._lib_utils import array_repr, device_array_repr, schema_repr,
device_repr
+from struct import unpack_from, iter_unpack
+from nanoarrow import _lib_utils
def c_version():
"""Return the nanoarrow C library version string
@@ -46,11 +50,12 @@ def c_version():
return ArrowNanoarrowVersion().decode("UTF-8")
+# PyCapsule utilities
#
-# PyCapsule export utilities
-#
-
-
+# PyCapsules are used (1) to safely manage memory associated with C structures
+# by initializing them and ensuring the appropriate cleanup is invoked when
+# the object is deleted; and (2) as an export mechanism conforming to the
+# Arrow PyCapsule interface for the objects where this is defined.
cdef void pycapsule_schema_deleter(object schema_capsule) noexcept:
cdef ArrowSchema* schema = <ArrowSchema*>PyCapsule_GetPointer(
schema_capsule, 'arrow_schema'
@@ -58,11 +63,11 @@ cdef void pycapsule_schema_deleter(object schema_capsule)
noexcept:
if schema.release != NULL:
ArrowSchemaRelease(schema)
- free(schema)
+ ArrowFree(schema)
cdef object alloc_c_schema(ArrowSchema** c_schema) noexcept:
- c_schema[0] = <ArrowSchema*> malloc(sizeof(ArrowSchema))
+ c_schema[0] = <ArrowSchema*> ArrowMalloc(sizeof(ArrowSchema))
# Ensure the capsule destructor doesn't call a random release pointer
c_schema[0].release = NULL
return PyCapsule_New(c_schema[0], 'arrow_schema',
&pycapsule_schema_deleter)
@@ -76,17 +81,17 @@ cdef void pycapsule_array_deleter(object array_capsule)
noexcept:
if array.release != NULL:
ArrowArrayRelease(array)
- free(array)
+ ArrowFree(array)
cdef object alloc_c_array(ArrowArray** c_array) noexcept:
- c_array[0] = <ArrowArray*> malloc(sizeof(ArrowArray))
+ c_array[0] = <ArrowArray*> ArrowMalloc(sizeof(ArrowArray))
# Ensure the capsule destructor doesn't call a random release pointer
c_array[0].release = NULL
return PyCapsule_New(c_array[0], 'arrow_array', &pycapsule_array_deleter)
-cdef void pycapsule_stream_deleter(object stream_capsule) noexcept:
+cdef void pycapsule_array_stream_deleter(object stream_capsule) noexcept:
cdef ArrowArrayStream* stream = <ArrowArrayStream*>PyCapsule_GetPointer(
stream_capsule, 'arrow_array_stream'
)
@@ -94,98 +99,81 @@ cdef void pycapsule_stream_deleter(object stream_capsule)
noexcept:
if stream.release != NULL:
ArrowArrayStreamRelease(stream)
- free(stream)
+ ArrowFree(stream)
-cdef object alloc_c_stream(ArrowArrayStream** c_stream) noexcept:
- c_stream[0] = <ArrowArrayStream*> malloc(sizeof(ArrowArrayStream))
+cdef object alloc_c_array_stream(ArrowArrayStream** c_stream) noexcept:
+ c_stream[0] = <ArrowArrayStream*> ArrowMalloc(sizeof(ArrowArrayStream))
# Ensure the capsule destructor doesn't call a random release pointer
c_stream[0].release = NULL
- return PyCapsule_New(c_stream[0], 'arrow_array_stream',
&pycapsule_stream_deleter)
+ return PyCapsule_New(c_stream[0], 'arrow_array_stream',
&pycapsule_array_stream_deleter)
-cdef void arrow_array_release(ArrowArray* array) noexcept with gil:
- Py_DECREF(<object>array.private_data)
- array.private_data = NULL
- array.release = NULL
+cdef void pycapsule_device_array_deleter(object device_array_capsule) noexcept:
+ cdef ArrowDeviceArray* device_array =
<ArrowDeviceArray*>PyCapsule_GetPointer(
+ device_array_capsule, 'arrow_device_array'
+ )
+ # Do not invoke the deleter on a used/moved capsule
+ if device_array.array.release != NULL:
+ device_array.array.release(&device_array.array)
+ ArrowFree(device_array)
-cdef class SchemaHolder:
- """Memory holder for an ArrowSchema
- This class is responsible for the lifecycle of the ArrowSchema
- whose memory it is responsible for. When this object is deleted,
- a non-NULL release callback is invoked.
- """
- cdef ArrowSchema c_schema
+cdef object alloc_c_device_array(ArrowDeviceArray** c_device_array) noexcept:
+ c_device_array[0] = <ArrowDeviceArray*>
ArrowMalloc(sizeof(ArrowDeviceArray))
+ # Ensure the capsule destructor doesn't call a random release pointer
+ c_device_array[0].array.release = NULL
+ return PyCapsule_New(c_device_array[0], 'arrow_device_array',
&pycapsule_device_array_deleter)
- def __cinit__(self):
- self.c_schema.release = NULL
- def __dealloc__(self):
- if self.c_schema.release != NULL:
- ArrowSchemaRelease(&self.c_schema)
+cdef void pycapsule_array_view_deleter(object array_capsule) noexcept:
+ cdef ArrowArrayView* array_view = <ArrowArrayView*>PyCapsule_GetPointer(
+ array_capsule, 'nanoarrow_array_view'
+ )
- def _addr(self):
- return <uintptr_t>&self.c_schema
+ ArrowArrayViewReset(array_view)
+ ArrowFree(array_view)
-cdef class ArrayHolder:
- """Memory holder for an ArrowArray
- This class is responsible for the lifecycle of the ArrowArray
- whose memory it is responsible. When this object is deleted,
- a non-NULL release callback is invoked.
- """
- cdef ArrowArray c_array
+cdef object alloc_c_array_view(ArrowArrayView** c_array_view) noexcept:
+ c_array_view[0] = <ArrowArrayView*> ArrowMalloc(sizeof(ArrowArrayView))
+ ArrowArrayViewInitFromType(c_array_view[0], NANOARROW_TYPE_UNINITIALIZED)
+ return PyCapsule_New(c_array_view[0], 'nanoarrow_array_view',
&pycapsule_array_view_deleter)
- def __cinit__(self):
- self.c_array.release = NULL
- def __dealloc__(self):
- if self.c_array.release != NULL:
- ArrowArrayRelease(&self.c_array)
+cdef void arrow_array_release(ArrowArray* array) noexcept with gil:
+ Py_DECREF(<object>array.private_data)
+ array.private_data = NULL
+ array.release = NULL
- def _addr(self):
- return <uintptr_t>&self.c_array
-cdef class ArrayStreamHolder:
- """Memory holder for an ArrowArrayStream
+cdef object alloc_c_array_shallow_copy(object base, const ArrowArray* c_array)
noexcept:
+ """Make a shallow copy of an ArrowArray
- This class is responsible for the lifecycle of the ArrowArrayStream
- whose memory it is responsible. When this object is deleted,
- a non-NULL release callback is invoked.
+ To more safely implement export of an ArrowArray whose address may be
+ depended on by some other Python object, we implement a shallow copy
+ whose constructor calls Py_INCREF() on a Python object responsible
+ for the ArrowArray's lifecycle and whose deleter calls Py_DECREF() on
+ the same object.
"""
- cdef ArrowArrayStream c_array_stream
+ cdef:
+ ArrowArray* c_array_out
- def __cinit__(self):
- self.c_array_stream.release = NULL
+ array_capsule = alloc_c_array(&c_array_out)
- def __dealloc__(self):
- if self.c_array_stream.release != NULL:
- ArrowArrayStreamRelease(&self.c_array_stream)
+ # shallow copy
+ memcpy(c_array_out, c_array, sizeof(ArrowArray))
+ c_array_out.release = NULL
+ c_array_out.private_data = NULL
- def _addr(self):
- return <uintptr_t>&self.c_array_stream
+ # track original base
+ c_array_out.private_data = <void*>base
+ Py_INCREF(base)
+ c_array_out.release = arrow_array_release
-
-cdef class ArrayViewHolder:
- """Memory holder for an ArrowArrayView
-
- This class is responsible for the lifecycle of the ArrowArrayView
- whose memory it is responsible. When this object is deleted,
- ArrowArrayViewReset() is called on the contents.
- """
- cdef ArrowArrayView c_array_view
-
- def __cinit__(self):
- ArrowArrayViewInitFromType(&self.c_array_view,
NANOARROW_TYPE_UNINITIALIZED)
-
- def __dealloc__(self):
- ArrowArrayViewReset(&self.c_array_view)
-
- def _addr(self):
- return <uintptr_t>&self.c_array_view
+ return array_capsule
class NanoarrowException(RuntimeError):
@@ -233,39 +221,26 @@ cdef class Error:
raise NanoarrowException(what, code, "")
-cdef class Schema:
- """ArrowSchema wrapper
-
- This class provides a user-facing interface to access the fields of
- an ArrowSchema as defined in the Arrow C Data interface. These objects
- are usually created using `nanoarrow.schema()`. This Python wrapper
- allows access to schema fields but does not automatically deserialize
- their content: use `.view()` to validate and deserialize the content
- into a more easily inspectable object.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.schema(pa.int32())
- >>> schema.is_valid()
- True
- >>> schema.format
- 'i'
- >>> schema.name
- ''
- >>> schema_view = schema.view()
- >>> schema_view.type
- 'int32'
+cdef class CSchema:
+ """Low-level ArrowSchema wrapper
+
+ This object is a literal wrapper around a read-only ArrowSchema. It
provides field accessors
+ that return Python objects and handles the C Data interface lifecycle
(i.e., initialized
+ ArrowSchema structures are always released).
+
+ See `nanoarrow.c_schema()` for construction and usage examples.
"""
+ # Currently, _base is always the capsule holding the root of a tree of
ArrowSchemas
+ # (but in general is just a strong reference to an object whose Python
lifetime is
+ # used to guarantee that _ptr is valid).
cdef object _base
cdef ArrowSchema* _ptr
@staticmethod
def allocate():
- base = SchemaHolder()
- return Schema(base, base._addr())
+ cdef ArrowSchema* c_schema_out
+ base = alloc_c_schema(&c_schema_out)
+ return CSchema(base, <uintptr_t>(c_schema_out))
def __cinit__(self, object base, uintptr_t addr):
self._base = base,
@@ -282,7 +257,7 @@ cdef class Schema:
A valid PyCapsule with name 'arrow_schema' containing an
ArrowSchema pointer.
"""
- return Schema(
+ return CSchema(
schema_capsule,
<uintptr_t>PyCapsule_GetPointer(schema_capsule, 'arrow_schema')
)
@@ -315,20 +290,25 @@ cdef class Schema:
if self._ptr.release == NULL:
raise RuntimeError("schema is released")
- def _to_string(self, recursive=False):
- cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0,
recursive)
- cdef char* out = <char*>PyMem_Malloc(n_chars + 1)
+ def _to_string(self, int64_t max_chars=0, recursive=False):
+ cdef int64_t n_chars
+ if max_chars == 0:
+ n_chars = ArrowSchemaToString(self._ptr, NULL, 0, recursive)
+ else:
+ n_chars = max_chars
+
+ cdef char* out = <char*>ArrowMalloc(n_chars + 1)
if not out:
raise MemoryError()
ArrowSchemaToString(self._ptr, out, n_chars + 1, recursive)
out_str = out.decode("UTF-8")
- PyMem_Free(out)
+ ArrowFree(out)
return out_str
def __repr__(self):
- return schema_repr(self)
+ return _lib_utils.schema_repr(self)
@property
def format(self):
@@ -352,57 +332,49 @@ cdef class Schema:
def metadata(self):
self._assert_valid()
if self._ptr.metadata != NULL:
- return SchemaMetadata(self, <uintptr_t>self._ptr.metadata)
+ return SchemaMetadata(self._base, <uintptr_t>self._ptr.metadata)
else:
return None
@property
- def children(self):
+ def n_children(self):
self._assert_valid()
- return SchemaChildren(self)
+ return self._ptr.n_children
+
+ def child(self, int64_t i):
+ self._assert_valid()
+ if i < 0 or i >= self._ptr.n_children:
+ raise IndexError(f"{i} out of range [0, {self._ptr.n_children})")
+
+ return CSchema(self._base, <uintptr_t>self._ptr.children[i])
+
+ @property
+ def children(self):
+ for i in range(self.n_children):
+ yield self.child(i)
@property
def dictionary(self):
self._assert_valid()
if self._ptr.dictionary != NULL:
- return Schema(self, <uintptr_t>self._ptr.dictionary)
+ return CSchema(self, <uintptr_t>self._ptr.dictionary)
else:
return None
- def view(self):
- self._assert_valid()
- schema_view = SchemaView()
- cdef Error error = Error()
- cdef int result = ArrowSchemaViewInit(&schema_view._schema_view,
self._ptr, &error.c_error)
- if result != NANOARROW_OK:
- error.raise_message("ArrowSchemaViewInit()", result)
-
- return schema_view
+cdef class CSchemaView:
+ """Low-level ArrowSchemaView wrapper
-cdef class SchemaView:
- """ArrowSchemaView wrapper
+ This object is a literal wrapper around a read-only ArrowSchemaView. It
provides field accessors
+ that return Python objects and handles structure lifecycle. Compared to an
ArrowSchema,
+ the nanoarrow ArrowSchemaView facilitates access to the deserialized
content of an ArrowSchema
+ (e.g., parameter values for parameterized types).
- The ArrowSchemaView is a nanoarrow C library structure that facilitates
- access to the deserialized content of an ArrowSchema (e.g., parameter
- values for parameterized types). This wrapper extends that facility to
Python.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.schema(pa.decimal128(10, 3))
- >>> schema_view = schema.view()
- >>> schema_view.type
- 'decimal128'
- >>> schema_view.decimal_bitwidth
- 128
- >>> schema_view.decimal_precision
- 10
- >>> schema_view.decimal_scale
- 3
+ See `nanoarrow.c_schema_view()` for construction and usage examples.
"""
+ # _base is currently only a CSchema (but in general is just an object
whose Python
+ # lifetime guarantees that the pointed-to data from ArrowStringViews
remains valid
+ cdef object _base
cdef ArrowSchemaView _schema_view
_fixed_size_types = (
@@ -427,10 +399,16 @@ cdef class SchemaView:
NANOARROW_TYPE_SPARSE_UNION
)
- def __cinit__(self):
+ def __cinit__(self, CSchema schema):
+ self._base = schema
self._schema_view.type = NANOARROW_TYPE_UNINITIALIZED
self._schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED
+ cdef Error error = Error()
+ cdef int result = ArrowSchemaViewInit(&self._schema_view, schema._ptr,
&error.c_error)
+ if result != NANOARROW_OK:
+ error.raise_message("ArrowSchemaViewInit()", result)
+
@property
def type(self):
cdef const char* type_str = ArrowTypeString(self._schema_view.type)
@@ -445,27 +423,27 @@ cdef class SchemaView:
@property
def fixed_size(self):
- if self._schema_view.type in SchemaView._fixed_size_types:
+ if self._schema_view.type in CSchemaView._fixed_size_types:
return self._schema_view.fixed_size
@property
def decimal_bitwidth(self):
- if self._schema_view.type in SchemaView._decimal_types:
+ if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_bitwidth
@property
def decimal_precision(self):
- if self._schema_view.type in SchemaView._decimal_types:
+ if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_precision
@property
def decimal_scale(self):
- if self._schema_view.type in SchemaView._decimal_types:
+ if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_scale
@property
def time_unit(self):
- if self._schema_view.type in SchemaView._time_unit_types:
+ if self._schema_view.type in CSchemaView._time_unit_types:
return
ArrowTimeUnitString(self._schema_view.time_unit).decode('UTF-8')
@property
@@ -475,7 +453,7 @@ cdef class SchemaView:
@property
def union_type_ids(self):
- if self._schema_view.type in SchemaView._union_types:
+ if self._schema_view.type in CSchemaView._union_types:
type_ids_str =
self._schema_view.union_type_ids.decode('UTF-8').split(',')
return (int(type_id) for type_id in type_ids_str)
@@ -496,41 +474,30 @@ cdef class SchemaView:
self._schema_view.extension_metadata.size_bytes
)
-cdef class Array:
- """ArrowArray wrapper
-
- This class provides a user-facing interface to access the fields of
- an ArrowArray as defined in the Arrow C Data interface, holding an
- optional reference to a Schema that can be used to safely deserialize
- the content. These objects are usually created using `nanoarrow.array()`.
- This Python wrapper allows access to array fields but does not
- automatically deserialize their content: use `nanoarrow.array_view()`
- to validate and deserialize the content into a more easily inspectable
- object.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import numpy as np
- >>> import nanoarrow as na
- >>> array = na.array(pa.array(["one", "two", "three", None]))
- >>> array.length
- 4
- >>> array.null_count
- 1
- >>> array_view = na.array_view(array)
+
+ def __repr__(self):
+ return _lib_utils.schema_view_repr(self)
+
+cdef class CArray:
+ """Low-level ArrowArray wrapper
+
+ This object is a literal wrapper around a read-only ArrowArray. It
provides field accessors
+ that return Python objects and handles the C Data interface lifecycle
(i.e., initialized
+ ArrowArray structures are always released).
+
+ See `nanoarrow.c_array()` for construction and usage examples.
"""
cdef object _base
cdef ArrowArray* _ptr
- cdef Schema _schema
+ cdef CSchema _schema
@staticmethod
- def allocate(Schema schema):
- base = ArrayHolder()
- return Array(base, base._addr(), schema)
+ def allocate(CSchema schema):
+ cdef ArrowArray* c_array_out
+ base = alloc_c_array(&c_array_out)
+ return CArray(base, <uintptr_t>c_array_out, schema)
- def __cinit__(self, object base, uintptr_t addr, Schema schema):
+ def __cinit__(self, object base, uintptr_t addr, CSchema schema):
self._base = base
self._ptr = <ArrowArray*>addr
self._schema = schema
@@ -550,11 +517,11 @@ cdef class Array:
ArrowArray pointer.
"""
cdef:
- Schema out_schema
- Array out
+ CSchema out_schema
+ CArray out
- out_schema = Schema._import_from_c_capsule(schema_capsule)
- out = Array(
+ out_schema = CSchema._import_from_c_capsule(schema_capsule)
+ out = CArray(
array_capsule,
<uintptr_t>PyCapsule_GetPointer(array_capsule, 'arrow_array'),
out_schema
@@ -582,24 +549,11 @@ cdef class Array:
if requested_schema is not None:
raise NotImplementedError("requested_schema")
+ # Export a shallow copy pointing to the same data in a way
+ # that ensures this object stays valid.
# TODO optimize this to export a version where children are reference
# counted and can be released separately
-
- cdef:
- ArrowArray* c_array_out
-
- array_capsule = alloc_c_array(&c_array_out)
-
- # shallow copy
- memcpy(c_array_out, self._ptr, sizeof(ArrowArray))
- c_array_out.release = NULL
- c_array_out.private_data = NULL
-
- # track original base
- c_array_out.private_data = <void*>self._base
- Py_INCREF(self._base)
- c_array_out.release = arrow_array_release
-
+ array_capsule = alloc_c_array_shallow_copy(self._base, self._ptr)
return self._schema.__arrow_c_schema__(), array_capsule
def _addr(self):
@@ -610,9 +564,9 @@ cdef class Array:
def _assert_valid(self):
if self._ptr == NULL:
- raise RuntimeError("Array is NULL")
+ raise RuntimeError("CArray is NULL")
if self._ptr.release == NULL:
- raise RuntimeError("Array is released")
+ raise RuntimeError("CArray is released")
@property
def schema(self):
@@ -630,65 +584,71 @@ cdef class Array:
@property
def null_count(self):
+ self._assert_valid()
return self._ptr.null_count
+ @property
+ def n_buffers(self):
+ self._assert_valid()
+ return self._ptr.n_buffers
+
@property
def buffers(self):
+ self._assert_valid()
return tuple(<uintptr_t>self._ptr.buffers[i] for i in
range(self._ptr.n_buffers))
+ @property
+ def n_children(self):
+ self._assert_valid()
+ return self._ptr.n_children
+
+ def child(self, int64_t i):
+ self._assert_valid()
+ if i < 0 or i >= self._ptr.n_children:
+ raise IndexError(f"{i} out of range [0, {self._ptr.n_children})")
+ return CArray(self._base, <uintptr_t>self._ptr.children[i],
self._schema.child(i))
+
@property
def children(self):
- return ArrayChildren(self)
+ for i in range(self.n_children):
+ yield self.child(i)
@property
def dictionary(self):
self._assert_valid()
if self._ptr.dictionary != NULL:
- return Array(self, <uintptr_t>self._ptr.dictionary,
self._schema.dictionary)
+ return CArray(self, <uintptr_t>self._ptr.dictionary,
self._schema.dictionary)
else:
return None
def __repr__(self):
- return array_repr(self)
-
-
-cdef class ArrayView:
- """ArrowArrayView wrapper
-
- The ArrowArrayView is a nanoarrow C library structure that provides
- structured access to buffers addresses, buffer sizes, and buffer
- data types. The buffer data is usually propagated from an ArrowArray
- but can also be propagated from other types of objects (e.g., serialized
- IPC). The offset and length of this view are independent of its parent
- (i.e., this object can also represent a slice of its parent).
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import numpy as np
- >>> import nanoarrow as na
- >>> array = na.array(pa.array(["one", "two", "three", None]))
- >>> array_view = na.array_view(array)
- >>> np.array(array_view.buffers[1])
- array([ 0, 3, 6, 11, 11], dtype=int32)
- >>> np.array(array_view.buffers[2])
- array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
- dtype='|S1')
+ return _lib_utils.array_repr(self)
+
+
+cdef class CArrayView:
+ """Low-level ArrowArrayView wrapper
+
+ This object is a literal wrapper around an ArrowArrayView. It provides
field accessors
+ that return Python objects and handles the structure lifecycle (i.e.,
initialized
+ ArrowArrayView structures are always released).
+
+ See `nanoarrow.c_array_view()` for construction and usage examples.
"""
cdef object _base
cdef ArrowArrayView* _ptr
cdef ArrowDevice* _device
- cdef Schema _schema
- cdef object _base_buffer
- def __cinit__(self, object base, uintptr_t addr, Schema schema, object
base_buffer):
+ def __cinit__(self, object base, uintptr_t addr):
self._base = base
self._ptr = <ArrowArrayView*>addr
- self._schema = schema
- self._base_buffer = base_buffer
self._device = ArrowDeviceCpu()
+ @property
+ def storage_type(self):
+ cdef const char* type_str = ArrowTypeString(self._ptr.storage_type)
+ if type_str != NULL:
+ return type_str.decode('UTF-8')
+
@property
def length(self):
return self._ptr.length
@@ -701,87 +661,94 @@ cdef class ArrayView:
def null_count(self):
return self._ptr.null_count
+ @property
+ def n_children(self):
+ return self._ptr.n_children
+
+ def child(self, int64_t i):
+ if i < 0 or i >= self._ptr.n_children:
+ raise IndexError(f"{i} out of range [0, {self._ptr.n_children})")
+
+ cdef CArrayView child = CArrayView(
+ self._base,
+ <uintptr_t>self._ptr.children[i]
+ )
+
+ child._device = self._device
+ return child
+
@property
def children(self):
- return ArrayViewChildren(self)
+ for i in range(self.n_children):
+ yield self.child(i)
+
+ @property
+ def n_buffers(self):
+ for i in range(3):
+ if self._ptr.layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE:
+ return i
+ return 3
+
+ def buffer(self, int64_t i):
+ if i < 0 or i >= self.n_buffers:
+ raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
+
+ cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
+ return CBufferView(
+ self._base,
+ <uintptr_t>buffer_view,
+ self._ptr.layout.buffer_type[i],
+ self._ptr.layout.buffer_data_type[i],
+ self._ptr.layout.element_size_bits[i],
+ <uintptr_t>self._device
+ )
@property
def buffers(self):
- return ArrayViewBuffers(self)
+ for i in range(self.n_buffers):
+ yield self.buffer(i)
@property
def dictionary(self):
if self._ptr.dictionary == NULL:
return None
else:
- return ArrayView(
+ return CArrayView(
self,
- <uintptr_t>self._ptr.dictionary,
- self._schema.dictionary,
- None
+ <uintptr_t>self._ptr.dictionary
)
- @property
- def schema(self):
- return self._schema
-
- def _assert_cpu(self):
- if self._device.device_type != ARROW_DEVICE_CPU:
- raise RuntimeError("ArrayView is not representing a CPU device")
+ def __repr__(self):
+ return _lib_utils.array_view_repr(self)
@staticmethod
- def from_cpu_array(Array array):
- cdef ArrayViewHolder holder = ArrayViewHolder()
+ def from_cpu_array(CArray array):
+ cdef ArrowArrayView* c_array_view
+ base = alloc_c_array_view(&c_array_view)
cdef Error error = Error()
- cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view,
+ cdef int result = ArrowArrayViewInitFromSchema(c_array_view,
array._schema._ptr,
&error.c_error)
if result != NANOARROW_OK:
error.raise_message("ArrowArrayViewInitFromSchema()", result)
- result = ArrowArrayViewSetArray(&holder.c_array_view, array._ptr,
&error.c_error)
+ result = ArrowArrayViewSetArray(c_array_view, array._ptr,
&error.c_error)
if result != NANOARROW_OK:
error.raise_message("ArrowArrayViewSetArray()", result)
- return ArrayView(holder, holder._addr(), array._schema, array)
-
-
-cdef class SchemaChildren:
- """Wrapper for a lazily-resolved list of Schema children
- """
- cdef Schema _parent
- cdef int64_t _length
-
- def __cinit__(self, Schema parent):
- self._parent = parent
- self._length = parent._ptr.n_children
-
- def __len__(self):
- return self._length
-
- def __getitem__(self, k):
- k = int(k)
- if k < 0 or k >= self._length:
- raise IndexError(f"{k} out of range [0, {self._length})")
-
- return Schema(self._parent, self._child_addr(k))
-
- cdef _child_addr(self, int64_t i):
- cdef ArrowSchema** children = self._parent._ptr.children
- cdef ArrowSchema* child = children[i]
- return <uintptr_t>child
+ return CArrayView((base, array), <uintptr_t>c_array_view)
cdef class SchemaMetadata:
- """Wrapper for a lazily-parsed Schema.metadata string
+ """Wrapper for a lazily-parsed CSchema.metadata string
"""
- cdef object _parent
+ cdef object _base
cdef const char* _metadata
cdef ArrowMetadataReader _reader
- def __cinit__(self, object parent, uintptr_t ptr):
- self._parent = parent
+ def __cinit__(self, object base, uintptr_t ptr):
+ self._base = base
self._metadata = <const char*>ptr
def _init_reader(self):
@@ -804,65 +771,7 @@ cdef class SchemaMetadata:
yield key_obj, value_obj
-cdef class ArrayChildren:
- """Wrapper for a lazily-resolved list of Array children
- """
- cdef Array _parent
- cdef int64_t _length
-
- def __cinit__(self, Array parent):
- self._parent = parent
- self._length = parent._ptr.n_children
-
- def __len__(self):
- return self._length
-
- def __getitem__(self, k):
- k = int(k)
- if k < 0 or k >= self._length:
- raise IndexError(f"{k} out of range [0, {self._length})")
- return Array(self._parent, self._child_addr(k),
self._parent.schema.children[k])
-
- cdef _child_addr(self, int64_t i):
- cdef ArrowArray** children = self._parent._ptr.children
- cdef ArrowArray* child = children[i]
- return <uintptr_t>child
-
-
-cdef class ArrayViewChildren:
- """Wrapper for a lazily-resolved list of ArrayView children
- """
- cdef ArrayView _parent
- cdef int64_t _length
-
- def __cinit__(self, ArrayView parent):
- self._parent = parent
- self._length = parent._ptr.n_children
-
- def __len__(self):
- return self._length
-
- def __getitem__(self, k):
- k = int(k)
- if k < 0 or k >= self._length:
- raise IndexError(f"{k} out of range [0, {self._length})")
- cdef ArrayView child = ArrayView(
- self._parent,
- self._child_addr(k),
- self._parent._schema.children[k],
- None
- )
-
- child._device = self._parent._device
- return child
-
- cdef _child_addr(self, int64_t i):
- cdef ArrowArrayView** children = self._parent._ptr.children
- cdef ArrowArrayView* child = children[i]
- return <uintptr_t>child
-
-
-cdef class BufferView:
+cdef class CBufferView:
"""Wrapper for Array buffer content
This object is a Python wrapper around a buffer held by an Array.
@@ -878,10 +787,11 @@ cdef class BufferView:
cdef Py_ssize_t _element_size_bits
cdef Py_ssize_t _shape
cdef Py_ssize_t _strides
+ cdef char _format[128]
def __cinit__(self, object base, uintptr_t addr,
- ArrowBufferType buffer_type, ArrowType buffer_data_type,
- Py_ssize_t element_size_bits, uintptr_t device):
+ ArrowBufferType buffer_type, ArrowType buffer_data_type,
+ Py_ssize_t element_size_bits, uintptr_t device):
self._base = base
self._ptr = <ArrowBufferView*>addr
self._buffer_type = buffer_type
@@ -890,50 +800,129 @@ cdef class BufferView:
self._element_size_bits = element_size_bits
self._strides = self._item_size()
self._shape = self._ptr.size_bytes // self._strides
+ self._format[0] = 0
+ self._populate_format()
+
+ def _addr(self):
+ return <uintptr_t>self._ptr.data.data
+ @property
+ def device_type(self):
+ return self._device.device_type
+
+ @property
+ def device_id(self):
+ return self._device.device_id
+
+ @property
+ def element_size_bits(self):
+ return self._element_size_bits
+
+ @property
+ def size_bytes(self):
+ return self._ptr.size_bytes
+
+ @property
+ def type(self):
+ if self._buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
+ return "validity"
+ elif self._buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
+ return "type_id"
+ elif self._buffer_type == NANOARROW_BUFFER_TYPE_UNION_OFFSET:
+ return "union_offset"
+ elif self._buffer_type == NANOARROW_BUFFER_TYPE_DATA_OFFSET:
+ return "data_offset"
+ elif self._buffer_type == NANOARROW_BUFFER_TYPE_DATA:
+ return "data"
+
+ @property
+ def data_type(self):
+ return ArrowTypeString(self._buffer_data_type).decode("UTF-8")
+
+ @property
+ def format(self):
+ return self._format.decode("UTF-8")
+
+ @property
+ def item_size(self):
+ return self._strides
+
+ def __len__(self):
+ return self._shape
+
+ def __getitem__(self, int64_t i):
+ if i < 0 or i >= self._shape:
+ raise IndexError(f"Index {i} out of range")
+ cdef int64_t offset = self._strides * i
+ value = unpack_from(self.format, buffer=self, offset=offset)
+ if len(value) == 1:
+ return value[0]
+ else:
+ return value
+
+ def __iter__(self):
+ for value in iter_unpack(self.format, self):
+ if len(value) == 1:
+ yield value[0]
+ else:
+ yield value
cdef Py_ssize_t _item_size(self):
- if self._buffer_data_type == NANOARROW_TYPE_BOOL:
- return 1
- elif self._buffer_data_type == NANOARROW_TYPE_STRING:
- return 1
- elif self._buffer_data_type == NANOARROW_TYPE_BINARY:
+ if self._element_size_bits < 8:
return 1
else:
return self._element_size_bits // 8
- cdef const char* _get_format(self):
- if self._buffer_data_type == NANOARROW_TYPE_INT8:
- return "b"
+ cdef void _populate_format(self):
+ cdef const char* format_const = NULL
+ if self._element_size_bits == 0:
+ # Variable-size elements (e.g., data buffer for string or binary)
export as
+ # one byte per element (character if string, unspecified binary
otherwise)
+ if self._buffer_data_type == NANOARROW_TYPE_STRING:
+ format_const = "c"
+ else:
+ format_const = "B"
+ elif self._element_size_bits < 8:
+ # Bitmaps export as unspecified binary
+ format_const = "B"
+ elif self._buffer_data_type == NANOARROW_TYPE_INT8:
+ format_const = "b"
elif self._buffer_data_type == NANOARROW_TYPE_UINT8:
- return "B"
+ format_const = "B"
elif self._buffer_data_type == NANOARROW_TYPE_INT16:
- return "h"
+ format_const = "=h"
elif self._buffer_data_type == NANOARROW_TYPE_UINT16:
- return "H"
+ format_const = "=H"
elif self._buffer_data_type == NANOARROW_TYPE_INT32:
- return "i"
+ format_const = "=i"
elif self._buffer_data_type == NANOARROW_TYPE_UINT32:
- return "I"
+ format_const = "=I"
elif self._buffer_data_type == NANOARROW_TYPE_INT64:
- return "l"
+ format_const = "=q"
elif self._buffer_data_type == NANOARROW_TYPE_UINT64:
- return "L"
+ format_const = "=Q"
+ elif self._buffer_data_type == NANOARROW_TYPE_HALF_FLOAT:
+ format_const = "=e"
elif self._buffer_data_type == NANOARROW_TYPE_FLOAT:
- return "f"
+ format_const = "=f"
elif self._buffer_data_type == NANOARROW_TYPE_DOUBLE:
- return "d"
- elif self._buffer_data_type == NANOARROW_TYPE_STRING:
- return "c"
+ format_const = "=d"
+ elif self._buffer_data_type == NANOARROW_TYPE_INTERVAL_DAY_TIME:
+ format_const = "=ii"
+ elif self._buffer_data_type == NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
+ format_const = "=iiq"
+
+ if format_const != NULL:
+ snprintf(self._format, sizeof(self._format), "%s", format_const)
else:
- return "B"
+ snprintf(self._format, sizeof(self._format), "%ds",
self._element_size_bits // 8)
def __getbuffer__(self, Py_buffer *buffer, int flags):
if self._device.device_type != ARROW_DEVICE_CPU:
- raise RuntimeError("nanoarrow.BufferView is not a CPU array")
+ raise RuntimeError("nanoarrow.c_lib.CBufferView is not a CPU
buffer")
buffer.buf = <void*>self._ptr.data.data
- buffer.format = self._get_format()
+ buffer.format = self._format
buffer.internal = NULL
buffer.itemsize = self._strides
buffer.len = self._ptr.size_bytes
@@ -947,79 +936,18 @@ cdef class BufferView:
def __releasebuffer__(self, Py_buffer *buffer):
pass
+ def __repr__(self):
+ return f"<nanoarrow.c_lib.CBufferView>\n
{_lib_utils.buffer_view_repr(self)[1:]}"
-cdef class ArrayViewBuffers:
- """A lazily-resolved list of ArrayView buffers
- """
- cdef ArrayView _array_view
- cdef int64_t _length
-
- def __cinit__(self, ArrayView array_view):
- self._array_view = array_view
- self._length = 3
- for i in range(3):
- if self._array_view._ptr.layout.buffer_type[i] ==
NANOARROW_BUFFER_TYPE_NONE:
- self._length = i
- break
-
- def __len__(self):
- return self._length
-
- def __getitem__(self, k):
- k = int(k)
- if k < 0 or k >= self._length:
- raise IndexError(f"{k} out of range [0, {self._length})")
- cdef ArrowBufferView* buffer_view =
&(self._array_view._ptr.buffer_views[k])
- if buffer_view.data.data == NULL:
- return None
- return BufferView(
- self._array_view,
- <uintptr_t>buffer_view,
- self._array_view._ptr.layout.buffer_type[k],
- self._array_view._ptr.layout.buffer_data_type[k],
- self._array_view._ptr.layout.element_size_bits[k],
- <uintptr_t>self._array_view._device
- )
+cdef class CArrayStream:
+ """Low-level ArrowArrayStream wrapper
+ This object is a literal wrapper around an ArrowArrayStream. It provides
methods that
+ that wrap the underlying C callbacks and handles the C Data interface
lifecycle
+ (i.e., initialized ArrowArrayStream structures are always released).
-cdef class ArrayStream:
- """ArrowArrayStream wrapper
-
- This class provides a user-facing interface to access the fields of
- an ArrowArrayStream as defined in the Arrow C Stream interface.
- These objects are usually created using `nanoarrow.array_stream()`.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> pa_column = pa.array([1, 2, 3], pa.int32())
- >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
- >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
- >>> array_stream = na.array_stream(pa_reader)
- >>> array_stream.get_schema()
- <nanoarrow.Schema struct>
- - format: '+s'
- - name: ''
- - flags: 0
- - metadata: NULL
- - dictionary: NULL
- - children[1]:
- 'col1': <nanoarrow.Schema int32>
- - format: 'i'
- - name: 'col1'
- - flags: 2
- - metadata: NULL
- - dictionary: NULL
- - children[0]:
- >>> array_stream.get_next().length
- 3
- >>> array_stream.get_next() is None
- Traceback (most recent call last):
- ...
- StopIteration
+ See `nanoarrow.c_array_stream()` for construction and usage examples.
"""
cdef object _base
cdef ArrowArrayStream* _ptr
@@ -1027,8 +955,9 @@ cdef class ArrayStream:
@staticmethod
def allocate():
- base = ArrayStreamHolder()
- return ArrayStream(base, base._addr())
+ cdef ArrowArrayStream* c_array_stream_out
+ base = alloc_c_array_stream(&c_array_stream_out)
+ return CArrayStream(base, <uintptr_t>c_array_stream_out)
def __cinit__(self, object base, uintptr_t addr):
self._base = base
@@ -1046,7 +975,7 @@ cdef class ArrayStream:
A valid PyCapsule with name 'arrow_array_stream' containing an
ArrowArrayStream pointer.
"""
- return ArrayStream(
+ return CArrayStream(
stream_capsule,
<uintptr_t>PyCapsule_GetPointer(stream_capsule,
'arrow_array_stream')
)
@@ -1070,15 +999,11 @@ cdef class ArrayStream:
raise NotImplementedError("requested_schema")
cdef:
- ArrowArrayStream* c_stream_out
-
- stream_capsule = alloc_c_stream(&c_stream_out)
-
- # move the stream
- memcpy(c_stream_out, self._ptr, sizeof(ArrowArrayStream))
- self._ptr.release = NULL
+ ArrowArrayStream* c_array_stream_out
- return stream_capsule
+ array_stream_capsule = alloc_c_array_stream(&c_array_stream_out)
+ ArrowArrayStreamMove(self._ptr, c_array_stream_out)
+ return array_stream_capsule
def _addr(self):
return <uintptr_t>self._ptr
@@ -1092,7 +1017,7 @@ cdef class ArrayStream:
if self._ptr.release == NULL:
raise RuntimeError("array stream is released")
- def _get_schema(self, Schema schema):
+ def _get_schema(self, CSchema schema):
self._assert_valid()
cdef Error error = Error()
cdef int code = self._ptr.get_schema(self._ptr, schema._ptr)
@@ -1104,14 +1029,14 @@ cdef class ArrayStream:
def get_schema(self):
"""Get the schema associated with this stream
"""
- out = Schema.allocate()
+ out = CSchema.allocate()
self._get_schema(out)
return out
def get_next(self):
"""Get the next Array from this stream
- Returns None when there are no more arrays in this stream.
+ Raises StopIteration when there are no more arrays in this stream.
"""
self._assert_valid()
@@ -1120,11 +1045,11 @@ cdef class ArrayStream:
# which is guaranteed to call the C object's callback and
# faithfully pass on the returned value.
if self._cached_schema is None:
- self._cached_schema = Schema.allocate()
+ self._cached_schema = CSchema.allocate()
self._get_schema(self._cached_schema)
cdef Error error = Error()
- cdef Array array = Array.allocate(self._cached_schema)
+ cdef CArray array = CArray.allocate(self._cached_schema)
cdef int code = ArrowArrayStreamGetNext(self._ptr, array._ptr,
&error.c_error)
if code != NANOARROW_OK:
error.raise_error("ArrowArrayStream::get_next()", code)
@@ -1140,30 +1065,9 @@ cdef class ArrayStream:
def __next__(self):
return self.get_next()
- @staticmethod
- def allocate():
- base = ArrayStreamHolder()
- return ArrayStream(base, base._addr())
-
-
-cdef class DeviceArrayHolder:
- """Memory holder for an ArrowDeviceArray
-
- This class is responsible for the lifecycle of the ArrowDeviceArray
- whose memory it is responsible. When this object is deleted,
- a non-NULL release callback is invoked.
- """
- cdef ArrowDeviceArray c_array
-
- def __cinit__(self):
- self.c_array.array.release = NULL
-
- def __dealloc__(self):
- if self.c_array.array.release != NULL:
- ArrowArrayRelease(&self.c_array.array)
+ def __repr__(self):
+ return _lib_utils.array_stream_repr(self)
- def _addr(self):
- return <uintptr_t>&self.c_array
cdef class Device:
"""ArrowDevice wrapper
@@ -1180,17 +1084,18 @@ cdef class Device:
self._base = base,
self._ptr = <ArrowDevice*>addr
- def _array_init(self, uintptr_t array_addr, Schema schema):
+ def _array_init(self, uintptr_t array_addr, CSchema schema):
cdef ArrowArray* array_ptr = <ArrowArray*>array_addr
- cdef DeviceArrayHolder holder = DeviceArrayHolder()
- cdef int result = ArrowDeviceArrayInit(self._ptr, &holder.c_array,
array_ptr)
+ cdef ArrowDeviceArray* device_array_ptr
+ holder = alloc_c_device_array(&device_array_ptr)
+ cdef int result = ArrowDeviceArrayInit(self._ptr, device_array_ptr,
array_ptr)
if result != NANOARROW_OK:
Error.raise_error("ArrowDevice::init_array", result)
- return DeviceArray(holder, holder._addr(), schema)
+ return CDeviceArray(holder, <uintptr_t>device_array_ptr, schema)
def __repr__(self):
- return device_repr(self)
+ return _lib_utils.device_repr(self)
@property
def device_type(self):
@@ -1213,12 +1118,12 @@ cdef class Device:
return Device(None, <uintptr_t>ArrowDeviceCpu())
-cdef class DeviceArray:
+cdef class CDeviceArray:
cdef object _base
cdef ArrowDeviceArray* _ptr
- cdef Schema _schema
+ cdef CSchema _schema
- def __cinit__(self, object base, uintptr_t addr, Schema schema):
+ def __cinit__(self, object base, uintptr_t addr, CSchema schema):
self._base = base
self._ptr = <ArrowDeviceArray*>addr
self._schema = schema
@@ -1233,7 +1138,7 @@ cdef class DeviceArray:
@property
def array(self):
- return Array(self, <uintptr_t>&self._ptr.array, self._schema)
+ return CArray(self, <uintptr_t>&self._ptr.array, self._schema)
def __repr__(self):
- return device_array_repr(self)
+ return _lib_utils.device_array_repr(self)
diff --git a/python/src/nanoarrow/_lib_utils.py
b/python/src/nanoarrow/_lib_utils.py
index abbd1fc8..26085e30 100644
--- a/python/src/nanoarrow/_lib_utils.py
+++ b/python/src/nanoarrow/_lib_utils.py
@@ -23,11 +23,11 @@
def schema_repr(schema, indent=0):
indent_str = " " * indent
if schema._addr() == 0:
- return "<NULL nanoarrow.Schema>"
+ return "<NULL nanoarrow.c_lib.CSchema>"
elif not schema.is_valid():
- return "<released nanoarrow.Schema>"
+ return "<released nanoarrow.c_lib.CSchema>"
- lines = [f"<nanoarrow.Schema {schema._to_string()}>"]
+ lines = [f"<nanoarrow.c_lib.CSchema {schema._to_string()}>"]
for attr in ("format", "name", "flags"):
attr_repr = repr(getattr(schema, attr))
@@ -47,23 +47,28 @@ def schema_repr(schema, indent=0):
else:
lines.append(f"{indent_str}- dictionary: NULL")
- children = schema.children
- lines.append(f"{indent_str}- children[{len(children)}]:")
- for child in children:
+ lines.append(f"{indent_str}- children[{schema.n_children}]:")
+ for child in schema.children:
child_repr = schema_repr(child, indent=indent + 4)
lines.append(f"{indent_str} {repr(child.name)}: {child_repr}")
return "\n".join(lines)
-def array_repr(array, indent=0):
+def array_repr(array, indent=0, max_char_width=80):
+ if max_char_width < 20:
+ max_char_width = 20
+
indent_str = " " * indent
if array._addr() == 0:
- return "<NULL nanoarrow.Array>"
+ return "<NULL nanoarrow.c_lib.CArray>"
elif not array.is_valid():
- return "<released nanoarrow.Array>"
+ return "<released nanoarrow.c_lib.CArray>"
- lines = [f"<nanoarrow.Array {array.schema._to_string()}>"]
+ schema_string = array.schema._to_string(
+ max_chars=max_char_width - indent - 23, recursive=True
+ )
+ lines = [f"<nanoarrow.c_lib.CArray {schema_string}>"]
for attr in ("length", "offset", "null_count", "buffers"):
attr_repr = repr(getattr(array, attr))
lines.append(f"{indent_str}- {attr}: {attr_repr}")
@@ -74,17 +79,133 @@ def array_repr(array, indent=0):
else:
lines.append(f"{indent_str}- dictionary: NULL")
- children = array.children
- lines.append(f"{indent_str}- children[{len(children)}]:")
- for child in children:
+ lines.append(f"{indent_str}- children[{array.n_children}]:")
+ for child in array.children:
child_repr = array_repr(child, indent=indent + 4)
lines.append(f"{indent_str} {repr(child.schema.name)}: {child_repr}")
return "\n".join(lines)
+def schema_view_repr(schema_view):
+ lines = [
+ "<nanoarrow.c_lib.CSchemaView>",
+ f"- type: {repr(schema_view.type)}",
+ f"- storage_type: {repr(schema_view.storage_type)}",
+ ]
+
+ for attr_name in sorted(dir(schema_view)):
+ if attr_name.startswith("_") or attr_name in ("type", "storage_type"):
+ continue
+
+ attr_value = getattr(schema_view, attr_name)
+ if attr_value is None:
+ continue
+
+ lines.append(f"- {attr_name}: {repr(attr_value)}")
+
+ return "\n".join(lines)
+
+
+def array_view_repr(array_view, max_char_width=80, indent=0):
+ indent_str = " " * indent
+
+ lines = ["<nanoarrow.c_lib.CArrayView>"]
+
+ for attr in ("storage_type", "length", "offset", "null_count"):
+ attr_repr = repr(getattr(array_view, attr))
+ lines.append(f"{indent_str}- {attr}: {attr_repr}")
+
+ lines.append(f"{indent_str}- buffers[{array_view.n_buffers}]:")
+ for buffer in array_view.buffers:
+ lines.append(
+ f"{indent_str} - <{buffer_view_repr(buffer, max_char_width -
indent - 4)}>"
+ )
+
+ if array_view.dictionary:
+ dictionary_repr = array_view_repr(
+ array_view.dictionary, max_char_width=max_char_width,
indent=indent + 2
+ )
+ lines.append(f"{indent_str}- dictionary: {dictionary_repr}")
+ else:
+ lines.append(f"{indent_str}- dictionary: NULL")
+
+ lines.append(f"{indent_str}- children[{array_view.n_children}]:")
+ for child in array_view.children:
+ child_repr = array_view_repr(
+ child, max_char_width=max_char_width, indent=indent + 4
+ )
+ lines.append(f"{indent_str} - {child_repr}")
+
+ return "\n".join(lines)
+
+
+def buffer_view_repr(buffer_view, max_char_width=80):
+ if max_char_width < 20:
+ max_char_width = 20
+
+ prefix = f"{buffer_view.data_type} {buffer_view.type}"
+ prefix += f"[{buffer_view.size_bytes} b]"
+
+ if buffer_view.device_type == 1:
+ return (
+ prefix
+ + " "
+ + buffer_view_preview_cpu(buffer_view, max_char_width -
len(prefix) - 2)
+ )
+ else:
+ return prefix
+
+
+def buffer_view_preview_cpu(buffer_view, max_char_width):
+ if buffer_view.element_size_bits == 0:
+ preview_elements = max_char_width - 3
+ joined = repr(bytes(memoryview(buffer_view)[:preview_elements]))
+ elif buffer_view.element_size_bits == 1:
+ max_elements = max_char_width // 8
+ if max_elements > len(buffer_view):
+ preview_elements = len(buffer_view)
+ else:
+ preview_elements = max_elements
+
+ joined = "".join(
+ "".join(reversed(format(buffer_view[i], "08b")))
+ for i in range(preview_elements)
+ )
+ else:
+ max_elements = max_char_width // 3
+ if max_elements > len(buffer_view):
+ preview_elements = len(buffer_view)
+ else:
+ preview_elements = max_elements
+
+ joined = " ".join(repr(buffer_view[i]) for i in
range(preview_elements))
+
+ if len(joined) > max_char_width or preview_elements < len(buffer_view):
+ return joined[: (max_char_width - 3)] + "..."
+ else:
+ return joined
+
+
+def array_stream_repr(array_stream, max_char_width=80):
+ if array_stream._addr() == 0:
+ return "<NULL nanoarrow.c_lib.CArrayStream>"
+ elif not array_stream.is_valid():
+ return "<released nanoarrow.c_lib.CArrayStream>"
+
+ lines = ["<nanoarrow.c_lib.CArrayStream>"]
+ try:
+ schema = array_stream.get_schema()
+ schema_string = schema._to_string(max_chars=max_char_width - 16,
recursive=True)
+ lines.append(f"- get_schema(): {schema_string}")
+ except Exception as e:
+ lines.append(f"- get_schema(): <error calling get_schema(): {e}>")
+
+ return "\n".join(lines)
+
+
def device_array_repr(device_array):
- title_line = "<nanoarrow.device.DeviceArray>"
+ title_line = "<nanoarrow.device.c_lib.CDeviceArray>"
device_type = f"- device_type: {device_array.device_type}"
device_id = f"- device_id: {device_array.device_id}"
array = f"- array: {array_repr(device_array.array, indent=2)}"
diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py
new file mode 100644
index 00000000..90ed2626
--- /dev/null
+++ b/python/src/nanoarrow/c_lib.py
@@ -0,0 +1,307 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Arrow and nanoarrow C structure wrappers
+
+These classes and their constructors wrap Arrow C Data/Stream interface
structures
+(i.e., ``ArrowArray``, ``ArrowSchema``, and ``ArrowArrayStream``) and the
+nanoarrow C library structures that help deserialize their content (i.e., the
+``ArrowSchemaView`` and ``ArrowArrayView``). These wrappers are currently
implemented
+in Cython and their scope is limited to lifecycle management and member access
as
+Python objects.
+"""
+
+from nanoarrow._lib import CArray, CArrayStream, CArrayView, CSchema,
CSchemaView
+
+
+def c_schema(obj=None) -> CSchema:
+ """ArrowSchema wrapper
+
+ The ``CSchema`` class provides a Python-friendly interface to access the
fields
+ of an ``ArrowSchema`` as defined in the Arrow C Data interface. These
objects
+ are created using `nanoarrow.c_schema()`, which accepts any schema or
+ data type-like object according to the Arrow PyCapsule interface.
+
+ This Python wrapper allows access to schema struct members but does not
+ automatically deserialize their content: use :func:`c_schema_view` to
validate
+ and deserialize the content into a more easily inspectable object.
+
+ Note that the :class:`CSchema` objects returned by ``.child()`` hold strong
+ references to the original `ArrowSchema` to avoid copies while inspecting
an
+ imported structure.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.c_schema(pa.int32())
+ >>> schema.is_valid()
+ True
+ >>> schema.format
+ 'i'
+ >>> schema.name
+ ''
+ """
+
+ if isinstance(obj, CSchema):
+ return obj
+
+ if hasattr(obj, "__arrow_c_schema__"):
+ return CSchema._import_from_c_capsule(obj.__arrow_c_schema__())
+
+ # for pyarrow < 14.0
+ if hasattr(obj, "_export_to_c"):
+ out = CSchema.allocate()
+ obj._export_to_c(out._addr())
+ return out
+ else:
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_schema"
+ )
+
+
+def c_array(obj=None, requested_schema=None) -> CArray:
+ """ArrowArray wrapper
+
+ This class provides a user-facing interface to access the fields of an
ArrowArray
+ as defined in the Arrow C Data interface, holding an optional reference to
a
+ :class:`CSchema` that can be used to safely deserialize the content.
+
+ These objects are created using :func:`c_array`, which accepts any
array-like
+ object according to the Arrow PyCapsule interface.
+
+ This Python wrapper allows access to array fields but does not
automatically
+ deserialize their content: use :func:`c_array_view` to validate and
deserialize
+ the content into a more easily inspectable object.
+
+ Note that the :class:`CArray` objects returned by ``.child()`` hold strong
+ references to the original ``ArrowArray`` to avoid copies while
inspecting an
+ imported structure.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> import nanoarrow as na
+ >>> array = na.c_array(pa.array(["one", "two", "three", None]))
+ >>> array.length
+ 4
+ >>> array.null_count
+ 1
+ """
+
+ if requested_schema is not None:
+ requested_schema = c_schema(requested_schema)
+
+ if isinstance(obj, CArray) and requested_schema is None:
+ return obj
+
+ if hasattr(obj, "__arrow_c_array__"):
+ requested_schema_capsule = (
+ None if requested_schema is None else
requested_schema.__arrow_c_schema__()
+ )
+ return CArray._import_from_c_capsule(
+ *obj.__arrow_c_array__(requested_schema=requested_schema_capsule)
+ )
+
+ # for pyarrow < 14.0
+ if hasattr(obj, "_export_to_c"):
+ out = CArray.allocate(CSchema.allocate())
+ obj._export_to_c(out._addr(), out.schema._addr())
+ return out
+ else:
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_array"
+ )
+
+
+def c_array_stream(obj=None, requested_schema=None) -> CArrayStream:
+ """ArrowArrayStream wrapper
+
+ This class provides a user-facing interface to access the fields of
+ an ArrowArrayStream as defined in the Arrow C Stream interface.
+ These objects are usually created using `nanoarrow.c_array_stream()`.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> pa_column = pa.array([1, 2, 3], pa.int32())
+ >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
+ >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
+ >>> array_stream = na.c_array_stream(pa_reader)
+ >>> array_stream.get_schema()
+ <nanoarrow.c_lib.CSchema struct>
+ - format: '+s'
+ - name: ''
+ - flags: 0
+ - metadata: NULL
+ - dictionary: NULL
+ - children[1]:
+ 'col1': <nanoarrow.c_lib.CSchema int32>
+ - format: 'i'
+ - name: 'col1'
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
+ >>> array_stream.get_next().length
+ 3
+ >>> array_stream.get_next() is None
+ Traceback (most recent call last):
+ ...
+ StopIteration
+ """
+
+ if requested_schema is not None:
+ requested_schema = c_schema(requested_schema)
+
+ if isinstance(obj, CArrayStream) and requested_schema is None:
+ return obj
+
+ if hasattr(obj, "__arrow_c_stream__"):
+ requested_schema_capsule = (
+ None if requested_schema is None else
requested_schema.__arrow_c_schema__()
+ )
+ return CArrayStream._import_from_c_capsule(
+ obj.__arrow_c_stream__(requested_schema=requested_schema_capsule)
+ )
+
+ # for pyarrow < 14.0
+ if hasattr(obj, "_export_to_c"):
+ out = CArrayStream.allocate()
+ obj._export_to_c(out._addr())
+ return out
+ else:
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} "
+ "to nanoarrow.c_array_stream"
+ )
+
+
+def c_schema_view(obj) -> CSchemaView:
+ """ArrowSchemaView wrapper
+
+ The ``ArrowSchemaView`` is a nanoarrow C library structure that facilitates
+ access to the deserialized content of an ``ArrowSchema`` (e.g., parameter
values for
+ parameterized types). This wrapper extends that facility to Python.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.c_schema(pa.decimal128(10, 3))
+ >>> schema_view = na.c_schema_view(schema)
+ >>> schema_view.type
+ 'decimal128'
+ >>> schema_view.decimal_bitwidth
+ 128
+ >>> schema_view.decimal_precision
+ 10
+ >>> schema_view.decimal_scale
+ 3
+ """
+
+ if isinstance(obj, CSchemaView):
+ return obj
+
+ return CSchemaView(c_schema(obj))
+
+
+def c_array_view(obj, requested_schema=None) -> CArrayView:
+ """ArrowArrayView wrapper
+
+ The ``ArrowArrayView`` is a nanoarrow C library structure that provides
+ structured access to buffer addresses, buffer sizes, and buffer
+ data types. The buffer data is usually propagated from an ArrowArray
+ but can also be propagated from other types of objects (e.g., serialized
+ IPC). The offset and length of this view are independent of its parent
+ (i.e., this object can also represent a slice of its parent).
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> import nanoarrow as na
+ >>> array = na.c_array(pa.array(["one", "two", "three", None]))
+ >>> array_view = na.c_array_view(array)
+ >>> np.array(array_view.buffer(1))
+ array([ 0, 3, 6, 11, 11], dtype=int32)
+ >>> np.array(array_view.buffer(2))
+ array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
+ dtype='|S1')
+ """
+
+ if isinstance(obj, CArrayView) and requested_schema is None:
+ return obj
+
+ return CArrayView.from_cpu_array(c_array(obj, requested_schema))
+
+
+def allocate_c_schema():
+ """Allocate an uninitialized ArrowSchema wrapper
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.allocate_c_schema()
+ >>> pa.int32()._export_to_c(schema._addr())
+ """
+ return CSchema.allocate()
+
+
+def allocate_c_array(requested_schema=None):
+ """Allocate an uninitialized ArrowArray
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> array = na.allocate_c_array()
+ >>> pa.array([1, 2, 3])._export_to_c(array._addr(), array.schema._addr())
+ """
+ if requested_schema is not None:
+ requested_schema = c_schema(requested_schema)
+
+ return CArray.allocate(
+ CSchema.allocate() if requested_schema is None else requested_schema
+ )
+
+
+def allocate_c_array_stream():
+ """Allocate an uninitialized ArrowArrayStream wrapper
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> pa_column = pa.array([1, 2, 3], pa.int32())
+ >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
+ >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
+ >>> array_stream = na.allocate_c_array_stream()
+ >>> pa_reader._export_to_c(array_stream._addr())
+ """
+ return CArrayStream.allocate()
diff --git a/python/src/nanoarrow/device.py b/python/src/nanoarrow/device.py
index 9fe41511..15150cf1 100644
--- a/python/src/nanoarrow/device.py
+++ b/python/src/nanoarrow/device.py
@@ -15,15 +15,15 @@
# specific language governing permissions and limitations
# under the License.
-from nanoarrow._lib import Device, DeviceArray
-from nanoarrow.lib import array
+from nanoarrow._lib import CDeviceArray, Device
+from nanoarrow.c_lib import c_array
-def device_array(obj):
- if isinstance(obj, DeviceArray):
+def c_device_array(obj):
+ if isinstance(obj, CDeviceArray):
return obj
# Only CPU for now
- cpu_array = array(obj)
+ cpu_array = c_array(obj)
return Device.cpu()._array_init(cpu_array._addr(), cpu_array.schema)
diff --git a/python/src/nanoarrow/lib.py b/python/src/nanoarrow/lib.py
deleted file mode 100644
index 43e91dda..00000000
--- a/python/src/nanoarrow/lib.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from nanoarrow._lib import Array, ArrayStream, ArrayView, Schema
-
-
-def array_view(obj):
- if isinstance(obj, ArrayView):
- return obj
-
- return ArrayView.from_cpu_array(array(obj))
-
-
-def schema(obj):
- if isinstance(obj, Schema):
- return obj
-
- if hasattr(obj, "__arrow_c_schema__"):
- return Schema._import_from_c_capsule(obj.__arrow_c_schema__())
-
- # for pyarrow < 14.0
- if hasattr(obj, "_export_to_c"):
- out = Schema.allocate()
- obj._export_to_c(out._addr())
- return out
- else:
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} to
nanoarrow.Schema"
- )
-
-
-def array(obj):
- if isinstance(obj, Array):
- return obj
-
- if hasattr(obj, "__arrow_c_array__"):
- # TODO support requested schema
- return Array._import_from_c_capsule(*obj.__arrow_c_array__())
-
- # for pyarrow < 14.0
- if hasattr(obj, "_export_to_c"):
- out = Array.allocate(Schema.allocate())
- obj._export_to_c(out._addr(), out.schema._addr())
- return out
- else:
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} to
nanoarrow.Array"
- )
-
-
-def array_stream(obj):
- if isinstance(obj, ArrayStream):
- return obj
-
- if hasattr(obj, "__arrow_c_stream__"):
- # TODO support requested schema
- return ArrayStream._import_from_c_capsule(obj.__arrow_c_stream__())
-
- # for pyarrow < 14.0
- if hasattr(obj, "_export_to_c"):
- out = ArrayStream.allocate()
- obj._export_to_c(out._addr())
- return out
- else:
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} "
- "to nanoarrow.ArrowArrayStream"
- )
diff --git a/python/tests/test_capsules.py b/python/tests/test_capsules.py
index aef037f9..ae418cb7 100644
--- a/python/tests/test_capsules.py
+++ b/python/tests/test_capsules.py
@@ -50,7 +50,7 @@ def test_schema():
pa_schema = pa.schema([pa.field("some_name", pa.int32())])
for schema_obj in [pa_schema, SchemaWrapper(pa_schema)]:
- schema = na.schema(schema_obj)
+ schema = na.c_schema(schema_obj)
# some basic validation
assert schema.is_valid()
assert schema.format == "+s"
@@ -68,7 +68,7 @@ def test_array():
pa_arr = pa.array([1, 2, 3], pa.int32())
for arr_obj in [pa_arr, ArrayWrapper(pa_arr)]:
- array = na.array(arr_obj)
+ array = na.c_array(arr_obj)
# some basic validation
assert array.is_valid()
assert array.length == 3
@@ -81,11 +81,17 @@ def test_array():
assert array.is_valid()
+def test_array_requested_schema():
+ pa_arr = pa.array([1, 2, 3], pa.int32())
+ array = na.c_array(pa_arr, requested_schema=pa.int64())
+ assert array.schema.format == "l"
+
+
def test_array_stream():
pa_table = pa.table({"some_column": pa.array([1, 2, 3], pa.int32())})
for stream_obj in [pa_table, StreamWrapper(pa_table)]:
- array_stream = na.array_stream(stream_obj)
+ array_stream = na.c_array_stream(stream_obj)
# some basic validation
assert array_stream.is_valid()
array = array_stream.get_next()
@@ -96,7 +102,7 @@ def test_array_stream():
)
# roundtrip
- array_stream = na.array_stream(stream_obj)
+ array_stream = na.c_array_stream(stream_obj)
pa_table2 = pa.table(array_stream)
assert pa_table2.equals(pa_table)
# exporting a stream marks the original object as released (it is
moved)
@@ -106,19 +112,28 @@ def test_array_stream():
pa.table(array_stream)
+def test_array_stream_requested_schema():
+ pa_table = pa.table({"some_column": pa.array([1, 2, 3], pa.int32())})
+ schema2 = pa.schema([pa.field("some_column", pa.int64())])
+
+ # Not implemented in pyarrow yet
+ with pytest.raises(NotImplementedError):
+ na.c_array_stream(pa_table, requested_schema=schema2)
+
+
def test_export_invalid():
- schema = na.Schema.allocate()
+ schema = na.allocate_c_schema()
assert schema.is_valid() is False
with pytest.raises(RuntimeError, match="schema is released"):
pa.schema(schema)
- array = na.Array.allocate(na.Schema.allocate())
+ array = na.allocate_c_array()
assert array.is_valid() is False
- with pytest.raises(RuntimeError, match="Array is released"):
+ with pytest.raises(RuntimeError, match="CArray is released"):
pa.array(array)
- array_stream = na.ArrayStream.allocate()
+ array_stream = na.allocate_c_array_stream()
assert array_stream.is_valid() is False
with pytest.raises(RuntimeError, match="array stream is released"):
pa.table(array_stream)
@@ -129,21 +144,21 @@ def test_import_from_c_errors():
pa_arr = pa.array([1, 2, 3], pa.int32())
with pytest.raises(ValueError):
- na.Schema._import_from_c_capsule("wrong")
+ na.c_lib.CSchema._import_from_c_capsule("wrong")
with pytest.raises(ValueError):
- na.Schema._import_from_c_capsule(pa_arr.__arrow_c_array__())
+ na.c_lib.CSchema._import_from_c_capsule(pa_arr.__arrow_c_array__())
with pytest.raises(ValueError):
- na.Array._import_from_c_capsule("wrong", "wrong")
+ na.c_lib.CArray._import_from_c_capsule("wrong", "wrong")
with pytest.raises(ValueError):
- na.Array._import_from_c_capsule(
+ na.c_lib.CArray._import_from_c_capsule(
pa_arr.__arrow_c_array__(), pa_arr.type.__arrow_c_schema__()
)
with pytest.raises(ValueError):
- na.ArrayStream._import_from_c_capsule("wrong")
+ na.c_lib.CArrayStream._import_from_c_capsule("wrong")
with pytest.raises(ValueError):
- na.ArrayStream._import_from_c_capsule(pa_arr.__arrow_c_array__())
+
na.c_lib.CArrayStream._import_from_c_capsule(pa_arr.__arrow_c_array__())
diff --git a/python/tests/test_device.py b/python/tests/test_device.py
index 6ba6773c..fa1f1379 100644
--- a/python/tests/test_device.py
+++ b/python/tests/test_device.py
@@ -33,10 +33,10 @@ def test_cpu_device():
pa_array = pa.array([1, 2, 3])
- darray = device.device_array(pa_array)
+ darray = device.c_device_array(pa_array)
assert darray.device_type == 1
assert darray.device_id == 0
assert darray.array.length == 3
assert "device_type: 1" in repr(darray)
- assert device.device_array(darray) is darray
+ assert device.c_device_array(darray) is darray
diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py
index 5d0d9840..35bd7ea5 100644
--- a/python/tests/test_nanoarrow.py
+++ b/python/tests/test_nanoarrow.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import re
import sys
import pytest
@@ -25,75 +26,81 @@ np = pytest.importorskip("numpy")
pa = pytest.importorskip("pyarrow")
-def test_schema_helper():
- schema = na.Schema.allocate()
- assert na.schema(schema) is schema
+def test_c_version():
+ re_version = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$")
+ assert re_version.match(na.c_version()) is not None
- schema = na.schema(pa.null())
- assert isinstance(schema, na.Schema)
+
+def test_c_schema_helper():
+ schema = na.allocate_c_schema()
+ assert na.c_schema(schema) is schema
+
+ schema = na.c_schema(pa.null())
+ assert isinstance(schema, na.c_lib.CSchema)
with pytest.raises(TypeError):
- na.schema(None)
+ na.c_schema(1234)
-def test_array_helper():
- array = na.Array.allocate(na.Schema.allocate())
- assert na.array(array) is array
+def test_c_array_helper():
+ array = na.allocate_c_array()
+ assert na.c_array(array) is array
- array = na.array(pa.array([], pa.null()))
- assert isinstance(array, na.Array)
+ array = na.c_array(pa.array([], pa.null()))
+ assert isinstance(array, na.c_lib.CArray)
with pytest.raises(TypeError):
- na.array(None)
+ na.c_array(1234)
def test_array_stream_helper():
- array_stream = na.ArrayStream.allocate()
- assert na.array_stream(array_stream) is array_stream
+ array_stream = na.allocate_c_array_stream()
+ assert na.c_array_stream(array_stream) is array_stream
with pytest.raises(TypeError):
- na.array_stream(None)
+ na.c_array_stream(1234)
def test_array_view_helper():
- array = na.array(pa.array([1, 2, 3]))
- view = na.array_view(array)
- assert isinstance(view, na.ArrayView)
- assert na.array_view(view) is view
+ array = na.c_array(pa.array([1, 2, 3]))
+ view = na.c_array_view(array)
+ assert isinstance(view, na.c_lib.CArrayView)
+ assert na.c_array_view(view) is view
-def test_schema_basic():
- schema = na.Schema.allocate()
+def test_c_schema_basic():
+ schema = na.allocate_c_schema()
assert schema.is_valid() is False
assert schema._to_string() == "[invalid: schema is released]"
- assert repr(schema) == "<released nanoarrow.Schema>"
+ assert repr(schema) == "<released nanoarrow.c_lib.CSchema>"
- schema = na.schema(pa.schema([pa.field("some_name", pa.int32())]))
+ schema = na.c_schema(pa.schema([pa.field("some_name", pa.int32())]))
assert schema.format == "+s"
assert schema.flags == 0
assert schema.metadata is None
- assert len(schema.children) == 1
- assert schema.children[0].format == "i"
- assert schema.children[0].name == "some_name"
- assert schema.children[0]._to_string() == "int32"
- assert "<nanoarrow.Schema int32>" in repr(schema)
+ assert schema.n_children == 1
+ assert len(list(schema.children)) == 1
+ assert schema.child(0).format == "i"
+ assert schema.child(0).name == "some_name"
+ assert schema.child(0)._to_string() == "int32"
+ assert "<nanoarrow.c_lib.CSchema int32>" in repr(schema)
assert schema.dictionary is None
with pytest.raises(IndexError):
- schema.children[1]
+ schema.child(1)
-def test_schema_dictionary():
- schema = na.schema(pa.dictionary(pa.int32(), pa.utf8()))
+def test_c_schema_dictionary():
+ schema = na.c_schema(pa.dictionary(pa.int32(), pa.utf8()))
assert schema.format == "i"
assert schema.dictionary.format == "u"
- assert "dictionary: <nanoarrow.Schema string" in repr(schema)
+ assert "dictionary: <nanoarrow.c_lib.CSchema string" in repr(schema)
def test_schema_metadata():
meta = {"key1": "value1", "key2": "value2"}
- schema = na.schema(pa.field("", pa.int32(), metadata=meta))
+ schema = na.c_schema(pa.field("", pa.int32(), metadata=meta))
assert len(schema.metadata) == 2
@@ -103,13 +110,14 @@ def test_schema_metadata():
assert "'key1': b'value1'" in repr(schema)
-def test_schema_view():
- schema = na.Schema.allocate()
+def test_c_schema_view():
+ schema = na.allocate_c_schema()
with pytest.raises(RuntimeError):
- schema.view()
+ na.c_schema_view(schema)
- schema = na.schema(pa.int32())
- view = schema.view()
+ schema = na.c_schema(pa.int32())
+ view = na.c_schema_view(schema)
+ assert "- type: 'int32'" in repr(view)
assert view.type == "int32"
assert view.storage_type == "int32"
@@ -123,32 +131,32 @@ def test_schema_view():
assert view.extension_metadata is None
-def test_schema_view_extra_params():
- schema = na.schema(pa.binary(12))
- view = schema.view()
+def test_c_schema_view_extra_params():
+ schema = na.c_schema(pa.binary(12))
+ view = na.c_schema_view(schema)
assert view.fixed_size == 12
- schema = na.schema(pa.list_(pa.int32(), 12))
+ schema = na.c_schema(pa.list_(pa.int32(), 12))
assert view.fixed_size == 12
- schema = na.schema(pa.decimal128(10, 3))
- view = schema.view()
+ schema = na.c_schema(pa.decimal128(10, 3))
+ view = na.c_schema_view(schema)
assert view.decimal_bitwidth == 128
assert view.decimal_precision == 10
assert view.decimal_scale == 3
- schema = na.schema(pa.decimal256(10, 3))
- view = schema.view()
+ schema = na.c_schema(pa.decimal256(10, 3))
+ view = na.c_schema_view(schema)
assert view.decimal_bitwidth == 256
assert view.decimal_precision == 10
assert view.decimal_scale == 3
- schema = na.schema(pa.duration("us"))
- view = schema.view()
+ schema = na.c_schema(pa.duration("us"))
+ view = na.c_schema_view(schema)
assert view.time_unit == "us"
- schema = na.schema(pa.timestamp("us", tz="America/Halifax"))
- view = schema.view()
+ schema = na.c_schema(pa.timestamp("us", tz="America/Halifax"))
+ view = na.c_schema_view(schema)
assert view.type == "timestamp"
assert view.storage_type == "int64"
assert view.time_unit == "us"
@@ -158,56 +166,61 @@ def test_schema_view_extra_params():
"ARROW:extension:name": "some_name",
"ARROW:extension:metadata": "some_metadata",
}
- schema = na.schema(pa.field("", pa.int32(), metadata=meta))
- view = schema.view()
+ schema = na.c_schema(pa.field("", pa.int32(), metadata=meta))
+ view = na.c_schema_view(schema)
assert view.extension_name == "some_name"
assert view.extension_metadata == b"some_metadata"
-def test_array_empty():
- array = na.Array.allocate(na.Schema.allocate())
+def test_c_array_empty():
+ array = na.allocate_c_array()
assert array.is_valid() is False
- assert repr(array) == "<released nanoarrow.Array>"
+ assert repr(array) == "<released nanoarrow.c_lib.CArray>"
-def test_array():
- array = na.array(pa.array([1, 2, 3], pa.int32()))
+def test_c_array():
+ array = na.c_array(pa.array([1, 2, 3], pa.int32()))
assert array.is_valid() is True
assert array.length == 3
assert array.offset == 0
assert array.null_count == 0
+ assert array.n_buffers == 2
assert len(array.buffers) == 2
assert array.buffers[0] == 0
- assert len(array.children) == 0
+ assert array.n_children == 0
+ assert len(list(array.children)) == 0
assert array.dictionary is None
- assert "<nanoarrow.Array int32" in repr(array)
+ assert "<nanoarrow.c_lib.CArray int32" in repr(array)
-def test_array_recursive():
- array = na.array(pa.record_batch([pa.array([1, 2, 3], pa.int32())],
["col"]))
- assert len(array.children) == 1
- assert array.children[0].length == 3
- assert array.children[0].schema._to_string() == "int32"
- assert "'col': <nanoarrow.Array int32" in repr(array)
+def test_c_array_recursive():
+ array = na.c_array(pa.record_batch([pa.array([1, 2, 3], pa.int32())],
["col"]))
+ assert array.n_children == 1
+ assert len(list(array.children)) == 1
+ assert array.child(0).length == 3
+ assert array.child(0).schema._to_string() == "int32"
+ assert "'col': <nanoarrow.c_lib.CArray int32" in repr(array)
with pytest.raises(IndexError):
- array.children[1]
+ array.child(-1)
-def test_array_dictionary():
- array = na.array(pa.array(["a", "b", "b"]).dictionary_encode())
+def test_c_array_dictionary():
+ array = na.c_array(pa.array(["a", "b", "b"]).dictionary_encode())
assert array.length == 3
assert array.dictionary.length == 2
- assert "dictionary: <nanoarrow.Array string>" in repr(array)
+ assert "dictionary: <nanoarrow.c_lib.CArray string>" in repr(array)
-def test_array_view():
- array = na.array(pa.array([1, 2, 3], pa.int32()))
- view = na.array_view(array)
+def test_c_array_view():
+ array = na.c_array(pa.array([1, 2, 3], pa.int32()))
+ view = na.c_array_view(array)
- assert view.schema is array.schema
+ assert view.storage_type == "int32"
+ assert "- storage_type: 'int32'" in repr(view)
+ assert "<int32 data[12 b] 1 2 3>" in repr(view)
- data_buffer = memoryview(view.buffers[1])
+ data_buffer = memoryview(view.buffer(1))
data_buffer_copy = bytes(data_buffer)
assert len(data_buffer_copy) == 12
@@ -217,46 +230,52 @@ def test_array_view():
assert data_buffer_copy ==
b"\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03"
with pytest.raises(IndexError):
- view.children[1]
+ view.child(0)
+
+ with pytest.raises(IndexError):
+ view.child(-1)
-def test_array_view_recursive():
+def test_c_array_view_recursive():
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
- array = na.array(pa_array)
+ array = na.c_array(pa_array)
assert array.schema.format == "+s"
assert array.length == 3
- assert len(array.children) == 1
+ assert array.n_children == 1
+ assert len(list(array.children)) == 1
- assert array.children[0].schema.format == "i"
- assert array.children[0].length == 3
- assert array.children[0].schema._addr() == array.schema.children[0]._addr()
+ assert array.child(0).schema.format == "i"
+ assert array.child(0).length == 3
+ assert array.child(0).schema._addr() == array.schema.child(0)._addr()
- view = na.array_view(array)
- assert len(view.buffers) == 1
- assert len(view.children) == 1
- assert view.schema._addr() == array.schema._addr()
+ view = na.c_array_view(array)
+ assert view.n_buffers == 1
+ assert len(list(view.buffers)) == 1
+ assert view.n_children == 1
+ assert len(list(view.children)) == 1
- assert len(view.children[0].buffers) == 2
- assert view.children[0].schema._addr() == array.schema.children[0]._addr()
- assert view.children[0].schema._addr() == array.children[0].schema._addr()
+ assert view.child(0).n_buffers == 2
+ assert len(list(view.child(0).buffers)) == 2
+ assert "- children[1]" in repr(view)
-def test_array_view_dictionary():
+def test_c_array_view_dictionary():
pa_array = pa.array(["a", "b", "b"], pa.dictionary(pa.int32(), pa.utf8()))
- array = na.array(pa_array)
+ array = na.c_array(pa_array)
assert array.schema.format == "i"
assert array.dictionary.schema.format == "u"
- view = na.array_view(array)
- assert len(view.buffers) == 2
- assert len(view.dictionary.buffers) == 3
+ view = na.c_array_view(array)
+ assert view.n_buffers == 2
+ assert view.dictionary.n_buffers == 3
+ assert "- dictionary: <nanoarrow.c_lib.CArrayView>" in repr(view)
-def test_buffers_data():
+def test_buffers_integer():
data_types = [
(pa.uint8(), np.uint8()),
(pa.int8(), np.int8()),
@@ -266,42 +285,177 @@ def test_buffers_data():
(pa.int32(), np.int32()),
(pa.uint64(), np.uint64()),
(pa.int64(), np.int64()),
+ ]
+
+ for pa_type, np_type in data_types:
+ view = na.c_array_view(pa.array([0, 1, 2], pa_type))
+ data_buffer = view.buffer(1)
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(data_buffer), np.array([0, 1, 2], np_type)
+ )
+
+ # Check via iterator interface
+ assert list(data_buffer) == [0, 1, 2]
+
+ # Check via buffer get_item interface
+ assert [data_buffer[i] for i in range(len(data_buffer))] ==
list(data_buffer)
+
+ # Check repr
+ assert "0 1 2" in repr(data_buffer)
+
+
+def test_buffers_float():
+ data_types = [
(pa.float32(), np.float32()),
(pa.float64(), np.float64()),
]
for pa_type, np_type in data_types:
- view = na.array_view(pa.array([0, 1, 2], pa_type))
+ view = na.c_array_view(pa.array([0, 1, 2], pa_type))
+ data_buffer = view.buffer(1)
+
+ # Check via buffer interface
np.testing.assert_array_equal(
- np.array(view.buffers[1]), np.array([0, 1, 2], np_type)
+ np.array(data_buffer), np.array([0, 1, 2], np_type)
)
+ # Check via iterator interface
+ assert list(data_buffer) == [0.0, 1.0, 2.0]
+
+ # Check via buffer get_item interface
+ assert [data_buffer[i] for i in range(len(data_buffer))] ==
list(data_buffer)
+
+ # Check repr
+ assert "0.0 1.0 2.0" in repr(data_buffer)
+
+
+def test_buffers_half_float():
+ # pyarrow can only create half_float from np.float16()
+ np_array = np.array([0, 1, 2], np.float16())
+ view = na.c_array_view(pa.array(np_array))
+ data_buffer = view.buffer(1)
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(data_buffer), np.array([0, 1, 2], np.float16())
+ )
+
+ # Check via iterator interface
+ assert list(data_buffer) == [0.0, 1.0, 2.0]
+
+ # Check via buffer get_item interface
+ assert [data_buffer[i] for i in range(len(data_buffer))] ==
list(data_buffer)
+
+ # Check repr
+ assert "0.0 1.0 2.0" in repr(data_buffer)
+
+
+def test_buffers_bool():
+ view = na.c_array_view(pa.array([True, True, True, False]))
+ data_buffer = view.buffer(1)
+
+ assert data_buffer.size_bytes == 1
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(data_buffer), np.array([1 + 2 + 4], np.int32())
+ )
+
+ # Check via iterator interface
+ assert list(data_buffer) == [1 + 2 + 4]
+
+ # Check via buffer get_item interface
+ assert [data_buffer[i] for i in range(len(data_buffer))] ==
list(data_buffer)
+
+ # Check repr
+ assert "11100000" in repr(data_buffer)
+
def test_buffers_string():
- view = na.array_view(pa.array(["a", "bc", "def"]))
+ view = na.c_array_view(pa.array(["a", "bc", "def"]))
+
+ assert view.buffer(0).size_bytes == 0
+ assert view.buffer(1).size_bytes == 16
+ assert view.buffer(2).size_bytes == 6
- assert view.buffers[0] is None
+ # Check via buffer interface
np.testing.assert_array_equal(
- np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32())
+ np.array(view.buffer(1)), np.array([0, 1, 3, 6], np.int32())
)
np.testing.assert_array_equal(
- np.array(view.buffers[2]), np.array(list("abcdef"), dtype="|S1")
+ np.array(view.buffer(2)), np.array(list("abcdef"), dtype="|S1")
)
+ # Check via iterator interface
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [0, 1, 3, 6]
+ assert list(view.buffer(2)) == [item.encode("UTF-8") for item in "abcdef"]
+
+ # Check repr
+ assert "b'abcdef'" in repr(view.buffer(2))
+
def test_buffers_binary():
- view = na.array_view(pa.array([b"a", b"bc", b"def"]))
+ view = na.c_array_view(pa.array([b"a", b"bc", b"def"]))
- assert view.buffers[0] is None
+ assert view.buffer(0).size_bytes == 0
+ assert view.buffer(1).size_bytes == 16
+ assert view.buffer(2).size_bytes == 6
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(view.buffer(1)), np.array([0, 1, 3, 6], np.int32())
+ )
+ np.testing.assert_array_equal(np.array(view.buffer(2)),
np.array(list(b"abcdef")))
np.testing.assert_array_equal(
- np.array(view.buffers[1]), np.array([0, 1, 3, 6], np.int32())
+ np.array(list(view.buffer(2))), np.array(list(b"abcdef"))
)
- np.testing.assert_array_equal(np.array(view.buffers[2]),
np.array(list(b"abcdef")))
+ # Check via iterator interface
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [0, 1, 3, 6]
+ assert list(view.buffer(2)) == [int(item) for item in b"abcdef"]
+
+ # Check repr
+ assert "b'abcdef'" in repr(view.buffer(2))
+
+
+def test_buffers_fixed_size_binary():
+ view = na.c_array_view(pa.array([b"abc", b"def", b"ghi"], pa.binary(3)))
-def test_array_stream():
- array_stream = na.ArrayStream.allocate()
- assert na.array_stream(array_stream) is array_stream
+ assert view.buffer(1).size_bytes == 9
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(list(view.buffer(1))), np.array([b"abc", b"def", b"ghi"])
+ )
+
+ # Check via iterator interface
+ assert list(view.buffer(1)) == [b"abc", b"def", b"ghi"]
+
+
+def test_buffers_interval_month_day_nano():
+ view = na.c_array_view(
+ pa.array([pa.scalar((1, 15, -30), type=pa.month_day_nano_interval())])
+ )
+
+ assert view.buffer(1).size_bytes == 16
+
+ # Check via buffer interface
+ np.testing.assert_array_equal(
+ np.array(list(view.buffer(1))), np.array([(1, 15, -30)])
+ )
+
+ # Check via iterator interface
+ assert list(view.buffer(1)) == [(1, 15, -30)]
+
+
+def test_c_array_stream():
+ array_stream = na.allocate_c_array_stream()
+ assert na.c_array_stream(array_stream) is array_stream
+ assert repr(array_stream) == "<released nanoarrow.c_lib.CArrayStream>"
assert array_stream.is_valid() is False
with pytest.raises(RuntimeError):
@@ -312,21 +466,23 @@ def test_array_stream():
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
- array_stream = na.array_stream(reader)
+ array_stream = na.c_array_stream(reader)
assert array_stream.is_valid() is True
+ assert "struct<some_column: int32>" in repr(array_stream)
+
array = array_stream.get_next()
- assert array.schema.children[0].name == "some_column"
+ assert array.schema.child(0).name == "some_column"
with pytest.raises(StopIteration):
array_stream.get_next()
-def test_array_stream_iter():
+def test_c_array_stream_iter():
pa_array_child = pa.array([1, 2, 3], pa.int32())
pa_array = pa.record_batch([pa_array_child], names=["some_column"])
reader = pa.RecordBatchReader.from_batches(pa_array.schema, [pa_array])
- array_stream = na.array_stream(reader)
+ array_stream = na.c_array_stream(reader)
arrays = list(array_stream)
assert len(arrays) == 1
- assert arrays[0].schema.children[0].name == "some_column"
+ assert arrays[0].schema.child(0).name == "some_column"