jorisvandenbossche commented on code in PR #43488:
URL: https://github.com/apache/arrow/pull/43488#discussion_r1723250627
##########
python/pyarrow/types.pxi:
##########
@@ -1836,6 +1836,35 @@ cdef class FixedShapeTensorType(BaseExtensionType):
def __arrow_ext_scalar_class__(self):
return FixedShapeTensorScalar
+cdef class Bool8Type(BaseExtensionType):
+ """
+ Concrete class for bool8 extension type.
+
+ Bool8 is an alternate representation for boolean
+ arrays using 8 bits instead of 1 bit per value. The underlying
+ storage type is int8.
+
+ Examples
+ --------
+ Create an instance of bool8 extension type:
+
+ >>> import pyarrow as pa
+ >>> pa.bool8()
+ Bool8Type(extension<arrow.bool8>)
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ BaseExtensionType.init(self, type)
+ self.bool8_ext_type = <const CBool8Type*> type.get()
+
+ def __arrow_ext_class__(self):
+ return Bool8Array
+
+ def __reduce__(self):
+ return bool8, ()
+
+ def __arrow_ext_scalar_class__(self):
+ return Bool8Scalar
Review Comment:
```suggestion
```
##########
python/pyarrow/array.pxi:
##########
@@ -4503,11 +4503,34 @@ cdef class Bool8Array(ExtensionArray):
"""
def to_numpy(self, zero_copy_only=True, writable=False):
- try:
- return self.storage.to_numpy().view(np.bool_)
- except ArrowInvalid as e:
- if zero_copy_only:
- raise e
+ """
+ Return a NumPy bool view or copy of this array (experimental).
Review Comment:
```suggestion
Return a NumPy bool view or copy of this array.
```
(I see you copied that from the original docstring, but that seems a label
that we can remove now)
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1707,3 +1707,149 @@ def test_opaque_type(pickle_module, storage_type,
storage):
# cast extension type -> storage type
inner = arr.cast(storage_type)
assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ bool8_arr_class = bool8_type.__arrow_ext_class__()
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, bool8_arr_class)
+
+ # extension is registered by default
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+ batch = ipc_read_batch(buf)
+
+ assert batch.column(0).type.extension_name == "arrow.bool8"
+ assert isinstance(batch.column(0), bool8_arr_class)
+
+ # cast storage -> extension type
+ result = storage.cast(bool8_type)
+ assert result == arr
+
+ # cast extension type -> storage type
+ inner = arr.cast(storage_type)
+ assert inner == storage
+
+
+def test_bool8_to_bool_conversion():
+ bool_arr = pa.array([True, False, True, True, None], pa.bool_())
+ bool8_arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cast extension type -> arrow boolean type
+ assert bool8_arr.cast(pa.bool_()) == bool_arr
+
+ # cast arrow boolean type -> extension type, expecting canonical values
+ canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8())
+ canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(),
canonical_storage)
+ assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+
+
+def test_bool8_to_numpy_conversion():
+ arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cannot zero-copy with nulls
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was
True",
+ ):
+ arr.to_numpy()
+
+ # nullable conversion possible with a copy, but dest dtype is object
+ assert np.array_equal(
+ arr.to_numpy(zero_copy_only=False),
+ np.array([True, False, True, True, None], dtype=np.object_),
+ )
+
+ # zero-copy possible with non-null array
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2], pa.int8()),
+ )
+
+ arr_to_np = arr_no_nulls.to_numpy()
+ assert np.array_equal(arr_to_np, np_arr_no_nulls)
+
+ # same underlying buffer
+ assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+
+
+def test_bool8_from_numpy_conversion():
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([1, 0, 1, 1], pa.int8()),
+ )
+
+ arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls)
+ assert arr_from_np == canonical_bool8_arr_no_nulls
+
+ # same underlying buffer
+ assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data
+
+ # conversion only valid for 1-D arrays
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 2-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(
+ np.array([[True, False], [False, True]], dtype=np.bool_),
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 0-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(np.bool_())
+
+ # must use compatible storage type
+ with pytest.raises(
+ TypeError,
+ match="Array dtype float64 incompatible with bool8 storage",
+ ):
+ pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64))
+
+
+def test_bool8_scalar():
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py()
Review Comment:
Something I didn't think about in the previous round, but it might be better
to test the value explicitly in this case, instead of relying on python's
general truthiness:
```suggestion
assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True
```
Because otherwise this test doesn't actually ensure that the result is
`True` or `False`. If we were still returning the underlying storage of 0, 1, 2
etc, those tests would also pass in its current form.
(same for the lines below)
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1707,3 +1707,149 @@ def test_opaque_type(pickle_module, storage_type,
storage):
# cast extension type -> storage type
inner = arr.cast(storage_type)
assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ bool8_arr_class = bool8_type.__arrow_ext_class__()
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, bool8_arr_class)
Review Comment:
```suggestion
storage = pa.array([-1, 0, 1, 2, None], storage_type)
arr = pa.ExtensionArray.from_storage(bool8_type, storage)
assert isinstance(arr, pa.Bool8Array)
```
I would test it explicitly, because otherwise the test is essentially using
the same variable that is used by the implementation (assume someone
accidentally changed `Bool8Type.__arrow_ext_class__`, then the test wouldn't
necessarily catch this)
##########
python/pyarrow/types.pxi:
##########
@@ -1836,6 +1836,35 @@ cdef class FixedShapeTensorType(BaseExtensionType):
def __arrow_ext_scalar_class__(self):
return FixedShapeTensorScalar
Review Comment:
```suggestion
```
(small formatting nit: standard of two blank lines between class
definitions. But it seems this is not covered by the linting)
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1707,3 +1707,149 @@ def test_opaque_type(pickle_module, storage_type,
storage):
# cast extension type -> storage type
inner = arr.cast(storage_type)
assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ bool8_arr_class = bool8_type.__arrow_ext_class__()
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, bool8_arr_class)
+
+ # extension is registered by default
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+ batch = ipc_read_batch(buf)
+
+ assert batch.column(0).type.extension_name == "arrow.bool8"
+ assert isinstance(batch.column(0), bool8_arr_class)
+
+ # cast storage -> extension type
+ result = storage.cast(bool8_type)
+ assert result == arr
+
+ # cast extension type -> storage type
+ inner = arr.cast(storage_type)
+ assert inner == storage
+
+
+def test_bool8_to_bool_conversion():
+ bool_arr = pa.array([True, False, True, True, None], pa.bool_())
+ bool8_arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cast extension type -> arrow boolean type
+ assert bool8_arr.cast(pa.bool_()) == bool_arr
+
+ # cast arrow boolean type -> extension type, expecting canonical values
+ canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8())
+ canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(),
canonical_storage)
+ assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+
+
+def test_bool8_to_numpy_conversion():
+ arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cannot zero-copy with nulls
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was
True",
+ ):
+ arr.to_numpy()
+
+ # nullable conversion possible with a copy, but dest dtype is object
+ assert np.array_equal(
+ arr.to_numpy(zero_copy_only=False),
+ np.array([True, False, True, True, None], dtype=np.object_),
+ )
+
+ # zero-copy possible with non-null array
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2], pa.int8()),
+ )
+
+ arr_to_np = arr_no_nulls.to_numpy()
+ assert np.array_equal(arr_to_np, np_arr_no_nulls)
+
+ # same underlying buffer
+ assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+
Review Comment:
Maybe add the case with `arr_to_np = arr_no_nulls.to_numpy(writable=True)`
that in this case the buffers are not the same (as I saw in the latest diff you
explicitly added support for the `writable` keyword)
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1707,3 +1707,149 @@ def test_opaque_type(pickle_module, storage_type,
storage):
# cast extension type -> storage type
inner = arr.cast(storage_type)
assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ bool8_arr_class = bool8_type.__arrow_ext_class__()
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, bool8_arr_class)
+
+ # extension is registered by default
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+ batch = ipc_read_batch(buf)
+
+ assert batch.column(0).type.extension_name == "arrow.bool8"
+ assert isinstance(batch.column(0), bool8_arr_class)
Review Comment:
```suggestion
assert isinstance(batch.column(0), pa.Bool8Array)
```
##########
python/pyarrow/types.pxi:
##########
@@ -5333,7 +5334,7 @@ def bool8():
pyarrow.Table
unknown_col: extension<arrow.bool8>
----
- unknown_col: [[True, False, True, True, null]]
+ unknown_col: [[-1,0,1,2,null]]
Review Comment:
Sidenote: this is a good illustration for that we should ideally have a way
to let the extension type control this string representation
##########
python/pyarrow/types.pxi:
##########
@@ -5278,6 +5307,50 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def bool8():
+ """
+ Create instance of bool8 extension type.
+
+ Examples
+ --------
+ Create an instance of bool8 extension type:
+
+ >>> import pyarrow as pa
+ >>> type = pa.bool8()
+ >>> type
+ Bool8Type(extension<arrow.bool8>)
+
+ Inspect the data type:
+
+ >>> type.storage_type
+ DataType(int8)
+
+ Create a table with a bool8 array:
+
+ >>> arr = [-1, 0, 1, 2, None]
+ >>> storage = pa.array(arr, pa.int8())
+ >>> other = pa.ExtensionArray.from_storage(type, storage)
+ >>> pa.table([other], names=["unknown_col"])
+ pyarrow.Table
+ unknown_col: extension<arrow.bool8>
+ ----
+ unknown_col: [[-1,0,1,2,null]]
+
+ Returns
+ -------
+ type : Bool8Type
+ """
+
+ cdef Bool8Type out = Bool8Type.__new__(Bool8Type)
+
+ with nogil:
+ c_type = GetResultValue(CBool8Type.Make())
+
Review Comment:
```suggestion
c_type = GetResultValue(CBool8Type.Make())
```
This should be a very fast, trivial function call, so I don't think it will
be useful to release the GIL
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1661,3 +1661,133 @@ def test_legacy_int_type():
batch = ipc_read_batch(buf)
assert isinstance(batch.column(0).type, LegacyIntType)
assert batch.column(0) == ext_arr
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ bool8_arr_class = bool8_type.__arrow_ext_class__()
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, bool8_arr_class)
+
+ with registered_extension_type(bool8_type):
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+ batch = ipc_read_batch(buf)
+
+ assert batch.column(0).type.extension_name == "arrow.bool8"
+ assert isinstance(batch.column(0), bool8_arr_class)
+
+ # cast storage -> extension type
+ result = storage.cast(bool8_type)
+ assert result == arr
+
+ # cast extension type -> storage type
+ inner = arr.cast(storage_type)
+ assert inner == storage
+
+
+def test_bool8_to_bool_conversion():
+ bool_arr = pa.array([True, False, True, True, None], pa.bool_())
+ bool8_arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cast extension type -> arrow boolean type
+ assert bool8_arr.cast(pa.bool_()) == bool_arr
+
+ # cast arrow boolean type -> extension type, expecting canonical values
+ canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8())
+ canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(),
canonical_storage)
+ assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+
+
+def test_bool8_to_numpy_conversion():
+ arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cannot zero-copy with nulls
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was
True",
+ ):
+ arr.to_numpy()
+
+ # nullable conversion possible with a copy, but dest dtype is object
+ assert np.array_equal(
+ arr.to_numpy(zero_copy_only=False),
+ np.array([True, False, True, True, None], dtype=np.object_),
+ )
+
+ # zero-copy possible with non-null array
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2], pa.int8()),
+ )
+
+ arr_to_np = arr_no_nulls.to_numpy()
+ assert np.array_equal(arr_to_np, np_arr_no_nulls)
+
+ # same underlying buffer
+ assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+
+
+def test_bool8_from_numpy_conversion():
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([1, 0, 1, 1], pa.int8()),
+ )
+
+ arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls)
+ assert arr_from_np == canonical_bool8_arr_no_nulls
+
+ # same underlying buffer
+ assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data
+
+ # conversion only valid for 1-D arrays
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 2-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(
+ np.array([[True, False], [False, True]], dtype=np.bool_),
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 0-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(np.bool_())
+
+ # must use compatible storage type
+ with pytest.raises(
+ TypeError,
+ match="Array dtype float64 incompatible with bool8 storage",
+ ):
+ pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64))
+
+
+def test_bool8_scalar():
+ assert not pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py()
Review Comment:
Thanks for adding that support!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]