joellubi commented on code in PR #43488:
URL: https://github.com/apache/arrow/pull/43488#discussion_r1706066173
##########
python/pyarrow/array.pxi:
##########
@@ -4447,6 +4447,69 @@ cdef class FixedShapeTensorArray(ExtensionArray):
FixedSizeListArray.from_arrays(values, shape[1:].prod())
)
+cdef class Bool8Array(ExtensionArray):
+ """
+ Concrete class for bool8 extension arrays.
+ Examples
+ --------
+ Define the extension type for an bool8 array
+ >>> import pyarrow as pa
+ >>> bool8_type = pa.bool8()
+ Create an extension array
+ >>> arr = [-1, 0, 1, 2, None]
+ >>> storage = pa.array(arr, pa.int8())
+ >>> pa.ExtensionArray.from_storage(bool8_type, storage)
+ <pyarrow.lib.Bool8Array object at ...>
+ [
+ -1,
+ 0,
+ 1,
+ 2,
+ null
+ ]
+ """
+
+ def to_numpy(self, zero_copy_only=True, writable=False):
+ try:
+ return self.storage.to_numpy().view(np.bool_)
+ except ArrowInvalid as e:
+ if zero_copy_only:
+ raise e
+
+ return _pc().not_equal(self.storage,
0).to_numpy(zero_copy_only=zero_copy_only, writable=writable)
+
+ @staticmethod
+ def from_numpy(obj):
+ """
+ Convert numpy array to a bool8 extension array without making a copy.
+ The input array must be 1-dimensional, with either bool_ or int8 dtype.
+
+ Parameters
+ ----------
+ obj : numpy.ndarray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> arr = np.array([True, False, True], dtype=np.bool_)
+ >>> pa.Bool8Array.from_numpy(arr)
+ <pyarrow.lib.Bool8Array object at ...>
+ [
+ 1,
+ 0,
+ 1
+ ]
+ """
+
+ if obj.ndim != 1:
+ raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8
array")
+
+ if obj.dtype not in [np.bool_, np.int8]:
+ raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8
storage")
+
+ buf = foreign_buffer(obj.ctypes.data, obj.size)
+ return Array.from_buffers(bool8(), obj.size, [None, buf])
Review Comment:
I gave this a try and it works if the numpy array has `dtype=np.int8`:
```python
np_arr = np.array([1, 0, 1], dtype=np.int8)
pa_storage_arr = pa.array(np_arr, type=pa.int8())
pa_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), pa_storage_arr)
```
This does not produce any copies. The existing approach of using
`foreign_buffer` also works with `np_arr = np.array([True, False, True],
dtype=np.bool_)` without making a copy.
However, using the `pa.array()` constructor currently does make a copy when
going bool -> int8. I think avoiding that copy would require a zero-copy casting
kernel to be added to C++. That seems like it would be a better approach; I just
have to wrap my head around that part of the code.
CC: @felipecrv does this sound right ^?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]