jorisvandenbossche commented on code in PR #42113:
URL: https://github.com/apache/arrow/pull/42113#discussion_r1635931191
##########
python/pyarrow/array.pxi:
##########
@@ -1210,20 +1218,22 @@ cdef class Array(_PandasConvertible):
If a buffer is referenced multiple times then it will
only be counted once.
"""
- cdef:
- int64_t total_buffer_size
-
+ self._assert_cpu()
+ cdef int64_t total_buffer_size
total_buffer_size = TotalBufferSize(deref(self.ap))
return total_buffer_size
def __sizeof__(self):
+ self._assert_cpu()
return super(Array, self).__sizeof__() + self.nbytes
def __iter__(self):
+ self._assert_cpu()
for i in range(len(self)):
yield self.getitem(i)
def __repr__(self):
+ self._assert_cpu()
Review Comment:
This one should not be needed because this ends up calling `to_string` which
already has it, I think
##########
python/pyarrow/array.pxi:
##########
@@ -1404,8 +1424,9 @@ cdef class Array(_PandasConvertible):
-------
sliced : RecordBatch
"""
- cdef:
- shared_ptr[CArray] result
+ self._assert_cpu()
Review Comment:
I _think_ that slicing actually works?
##########
python/pyarrow/array.pxi:
##########
@@ -1307,6 +1319,8 @@ cdef class Array(_PandasConvertible):
-------
bool
"""
+ self._assert_cpu()
+ self.other._assert_cpu()
Review Comment:
```suggestion
other._assert_cpu()
```
##########
python/pyarrow/array.pxi:
##########
@@ -1883,6 +1917,28 @@ cdef class Array(_PandasConvertible):
device = GetResultValue(ExportDevice(self.sp_array))
return device.device_type, device.device_id
+ @property
+ def device_type(self):
+ """
+ The device type where the array resides.
+
+ Returns
+ -------
+ DeviceAllocationType
+ """
+ return _wrap_device_allocation_type(self.sp_array.get().device_type())
+
+ @property
+ def is_cpu(self):
+ """
+ Whether the array is CPU-accessible.
+ """
+ return self.device_type == DeviceAllocationType.CPU
+
+ def _assert_cpu(self):
+ if not self.is_cpu:
Review Comment:
If we would want to speed this up a bit (it's called in many places,
although typically should only give tiny overhead), you could also inline
similar code as you used above for the python attributes, like `if
self.sp_array.get().device_type() != CDeviceAllocationType_kCPU: ..`, and make
it a `cdef` instead of `def` (not entirely sure this is worth it)
##########
python/pyarrow/tests/test_array.py:
##########
@@ -3998,3 +3998,89 @@ def test_swapped_byte_order_fails(numpy_native_dtype):
# Struct type array
with pytest.raises(pa.ArrowNotImplementedError):
pa.StructArray.from_arrays([np_arr], names=['a'])
+
+
+def test_non_cpu_array():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.arange(4, dtype=np.int32)
+ validity = np.array([True, False, True, False], dtype=np.bool_)
+ cuda_data_buf = ctx.buffer_from_data(data)
+ cuda_validity_buf = ctx.buffer_from_data(validity)
+ arr = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr2 = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr_with_nulls = pa.Array.from_buffers(
+ pa.int32(), 4, [cuda_validity_buf, cuda_data_buf])
+
+ # Supported
+ arr.validate()
+ arr.validate(full=True)
+ assert arr.offset() == 0
+ assert arr.buffers() == [None, cuda_data_buf]
+ assert arr.device_type == pa.DeviceAllocationType.CUDA
+ assert arr.is_cpu is False
+
+ # Not Supported
+ with pytest.raises(NotImplementedError):
+ arr.diff(arr2)
+ with pytest.raises(NotImplementedError):
+ arr.cast(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.view(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.sum()
+ with pytest.raises(NotImplementedError):
+ arr.unique()
+ with pytest.raises(NotImplementedError):
+ arr.dictionary_encode()
+ with pytest.raises(NotImplementedError):
+ arr.value_counts()
+ with pytest.raises(NotImplementedError):
+ arr_with_nulls.null_count
+ with pytest.raises(NotImplementedError):
+ arr.nbytes
+ with pytest.raises(NotImplementedError):
+ arr.get_total_buffer_size()
+ with pytest.raises(NotImplementedError):
+ [i for i in iter(arr)]
+ with pytest.raises(NotImplementedError):
+ repr(arr)
+ with pytest.raises(NotImplementedError):
+ str(arr)
+ with pytest.raises(NotImplementedError):
+ arr == arr2
+ with pytest.raises(NotImplementedError):
+ arr.is_null()
+ with pytest.raises(NotImplementedError):
+ arr.is_nan()
+ with pytest.raises(NotImplementedError):
+ arr.is_valid()
+ with pytest.raises(NotImplementedError):
+ arr.fill_null(0)
+ with pytest.raises(NotImplementedError):
+ arr.__getitem__(0)
+ with pytest.raises(NotImplementedError):
+ arr.getitem(0)
Review Comment:
This one can be removed (`getitem` is a cdef function which is not callable
from python, so this will error for that reason), this is tested through
testing `__getitem__`
##########
python/pyarrow/tests/test_array.py:
##########
@@ -3998,3 +3998,89 @@ def test_swapped_byte_order_fails(numpy_native_dtype):
# Struct type array
with pytest.raises(pa.ArrowNotImplementedError):
pa.StructArray.from_arrays([np_arr], names=['a'])
+
+
+def test_non_cpu_array():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.arange(4, dtype=np.int32)
+ validity = np.array([True, False, True, False], dtype=np.bool_)
+ cuda_data_buf = ctx.buffer_from_data(data)
+ cuda_validity_buf = ctx.buffer_from_data(validity)
+ arr = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr2 = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr_with_nulls = pa.Array.from_buffers(
+ pa.int32(), 4, [cuda_validity_buf, cuda_data_buf])
+
+ # Supported
+ arr.validate()
+ arr.validate(full=True)
+ assert arr.offset() == 0
+ assert arr.buffers() == [None, cuda_data_buf]
+ assert arr.device_type == pa.DeviceAllocationType.CUDA
+ assert arr.is_cpu is False
+
Review Comment:
```suggestion
assert len(arr) == 4
```
##########
python/pyarrow/array.pxi:
##########
@@ -1210,20 +1218,22 @@ cdef class Array(_PandasConvertible):
If a buffer is referenced multiple times then it will
only be counted once.
"""
- cdef:
- int64_t total_buffer_size
-
+ self._assert_cpu()
+ cdef int64_t total_buffer_size
total_buffer_size = TotalBufferSize(deref(self.ap))
Review Comment:
My first thought was that getting the buffer size should be possible without
looking at the actual data. But so it seems that to avoid counting identical
buffers twice (if they are reused in a single array, which is possible), it
uses the buffer's address to distinguish buffers. Currently that uses
`buffer->data()` in `DoTotalBufferSize`. `data()` will return null for non-cpu
data, but since it doesn't actually use that address, I think this could also
use `buffer->address()` which will return the address always even for non-cpu
data.
(but that could be fine for a follow-up as well)
##########
python/pyarrow/tests/test_array.py:
##########
@@ -3998,3 +3998,89 @@ def test_swapped_byte_order_fails(numpy_native_dtype):
# Struct type array
with pytest.raises(pa.ArrowNotImplementedError):
pa.StructArray.from_arrays([np_arr], names=['a'])
+
+
+def test_non_cpu_array():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.arange(4, dtype=np.int32)
+ validity = np.array([True, False, True, False], dtype=np.bool_)
+ cuda_data_buf = ctx.buffer_from_data(data)
+ cuda_validity_buf = ctx.buffer_from_data(validity)
+ arr = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr2 = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr_with_nulls = pa.Array.from_buffers(
+ pa.int32(), 4, [cuda_validity_buf, cuda_data_buf])
+
+ # Supported
+ arr.validate()
+ arr.validate(full=True)
Review Comment:
I am wondering if this will actually be supported for more complex data
types. For example for variable size binary, it will check the actual offsets
if they are correct numbers (eg not negative, not out of bounds, increasing,
etc)
##########
python/pyarrow/tests/test_array.py:
##########
@@ -3998,3 +3998,89 @@ def test_swapped_byte_order_fails(numpy_native_dtype):
# Struct type array
with pytest.raises(pa.ArrowNotImplementedError):
pa.StructArray.from_arrays([np_arr], names=['a'])
+
+
+def test_non_cpu_array():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.arange(4, dtype=np.int32)
+ validity = np.array([True, False, True, False], dtype=np.bool_)
+ cuda_data_buf = ctx.buffer_from_data(data)
+ cuda_validity_buf = ctx.buffer_from_data(validity)
+ arr = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr2 = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr_with_nulls = pa.Array.from_buffers(
+ pa.int32(), 4, [cuda_validity_buf, cuda_data_buf])
+
+ # Supported
+ arr.validate()
+ arr.validate(full=True)
+ assert arr.offset() == 0
+ assert arr.buffers() == [None, cuda_data_buf]
+ assert arr.device_type == pa.DeviceAllocationType.CUDA
+ assert arr.is_cpu is False
+
+ # Not Supported
+ with pytest.raises(NotImplementedError):
+ arr.diff(arr2)
+ with pytest.raises(NotImplementedError):
+ arr.cast(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.view(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.sum()
+ with pytest.raises(NotImplementedError):
+ arr.unique()
+ with pytest.raises(NotImplementedError):
+ arr.dictionary_encode()
+ with pytest.raises(NotImplementedError):
+ arr.value_counts()
+ with pytest.raises(NotImplementedError):
+ arr_with_nulls.null_count
+ with pytest.raises(NotImplementedError):
+ arr.nbytes
+ with pytest.raises(NotImplementedError):
+ arr.get_total_buffer_size()
+ with pytest.raises(NotImplementedError):
+ [i for i in iter(arr)]
+ with pytest.raises(NotImplementedError):
+ repr(arr)
+ with pytest.raises(NotImplementedError):
+ str(arr)
+ with pytest.raises(NotImplementedError):
+ arr == arr2
+ with pytest.raises(NotImplementedError):
+ arr.is_null()
+ with pytest.raises(NotImplementedError):
+ arr.is_nan()
+ with pytest.raises(NotImplementedError):
+ arr.is_valid()
+ with pytest.raises(NotImplementedError):
+ arr.fill_null(0)
+ with pytest.raises(NotImplementedError):
+ arr.__getitem__(0)
+ with pytest.raises(NotImplementedError):
+ arr.getitem(0)
+ with pytest.raises(NotImplementedError):
+ arr.slice()
+ with pytest.raises(NotImplementedError):
+ arr.take([0])
+ with pytest.raises(NotImplementedError):
+ arr.drop_null()
+ with pytest.raises(NotImplementedError):
+ arr.filter([True, True, False, False])
+ with pytest.raises(NotImplementedError):
+ arr.index(0)
+ with pytest.raises(NotImplementedError):
+ arr.sort()
+ with pytest.raises(NotImplementedError):
+ arr.__array__()
+ with pytest.raises(NotImplementedError):
+ arr.to_numpy()
+ with pytest.raises(NotImplementedError):
+ arr.to_list()
+ with pytest.raises(NotImplementedError):
+ arr.__dlpack__()
+ with pytest.raises(NotImplementedError):
+ arr.__dlpack_device__()
Review Comment:
Maybe add a TODO comment here that this should be supported in the future
##########
python/pyarrow/tests/test_array.py:
##########
@@ -3998,3 +3998,89 @@ def test_swapped_byte_order_fails(numpy_native_dtype):
# Struct type array
with pytest.raises(pa.ArrowNotImplementedError):
pa.StructArray.from_arrays([np_arr], names=['a'])
+
+
+def test_non_cpu_array():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.arange(4, dtype=np.int32)
+ validity = np.array([True, False, True, False], dtype=np.bool_)
+ cuda_data_buf = ctx.buffer_from_data(data)
+ cuda_validity_buf = ctx.buffer_from_data(validity)
+ arr = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr2 = pa.Array.from_buffers(pa.int32(), 4, [None, cuda_data_buf])
+ arr_with_nulls = pa.Array.from_buffers(
+ pa.int32(), 4, [cuda_validity_buf, cuda_data_buf])
+
+ # Supported
+ arr.validate()
+ arr.validate(full=True)
+ assert arr.offset() == 0
+ assert arr.buffers() == [None, cuda_data_buf]
+ assert arr.device_type == pa.DeviceAllocationType.CUDA
+ assert arr.is_cpu is False
+
+ # Not Supported
+ with pytest.raises(NotImplementedError):
+ arr.diff(arr2)
+ with pytest.raises(NotImplementedError):
+ arr.cast(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.view(pa.int64())
+ with pytest.raises(NotImplementedError):
+ arr.sum()
+ with pytest.raises(NotImplementedError):
+ arr.unique()
+ with pytest.raises(NotImplementedError):
+ arr.dictionary_encode()
+ with pytest.raises(NotImplementedError):
+ arr.value_counts()
+ with pytest.raises(NotImplementedError):
+ arr_with_nulls.null_count
+ with pytest.raises(NotImplementedError):
+ arr.nbytes
+ with pytest.raises(NotImplementedError):
+ arr.get_total_buffer_size()
+ with pytest.raises(NotImplementedError):
+ [i for i in iter(arr)]
+ with pytest.raises(NotImplementedError):
+ repr(arr)
+ with pytest.raises(NotImplementedError):
+ str(arr)
+ with pytest.raises(NotImplementedError):
+ arr == arr2
+ with pytest.raises(NotImplementedError):
+ arr.is_null()
+ with pytest.raises(NotImplementedError):
+ arr.is_nan()
+ with pytest.raises(NotImplementedError):
+ arr.is_valid()
+ with pytest.raises(NotImplementedError):
+ arr.fill_null(0)
+ with pytest.raises(NotImplementedError):
+ arr.__getitem__(0)
Review Comment:
```suggestion
arr[0]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]