AlenkaF commented on code in PR #40354:
URL: https://github.com/apache/arrow/pull/40354#discussion_r3084799518
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -3014,6 +3027,24 @@ cdef extern from "arrow/extension_type.h" namespace
"arrow":
shared_ptr[CArray] storage()
+cdef extern from "arrow/extension/variable_shape_tensor.h" namespace
"arrow::extension" nogil:
Review Comment:
Could this be moved a little lower, after the fixed shape tensor ref?
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -3034,7 +3065,7 @@ cdef extern from "arrow/extension/uuid.h" namespace
"arrow::extension" nogil:
cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace
"arrow::extension" nogil:
cdef cppclass CFixedShapeTensorType \
- " arrow::extension::FixedShapeTensorType"(CExtensionType):
+ " arrow::extension::FixedShapeTensorType"(CExtensionType) nogil:
Review Comment:
Is there a reason `nogil` is added here and also used in the variable shape
version?
##########
python/pyarrow/array.pxi:
##########
@@ -4955,6 +4945,321 @@ cdef class Bool8Array(ExtensionArray):
return Bool8Array.from_storage(storage_arr)
+def _check_sequence_param(value, ndim, name):
+ if value is None:
+ return False
+ if not isinstance(value, Sequence):
+ raise TypeError(f"{name} must be a tuple or list")
+ if len(value) != ndim:
+ raise ValueError(
+ (f"The length of {name} ({len(value)}) does not match"
+ f" the number of tensor dimensions ({ndim})."))
+ return True
+
+
+def _validate_dim_names(dim_names, ndim):
+ if not _check_sequence_param(dim_names, ndim, "dim_names"):
+ return
+ if not all(isinstance(name, str) for name in dim_names):
+ raise TypeError("Each element of dim_names must be a string")
+
+
+def _validate_permutation(permutation, ndim):
+ if not _check_sequence_param(permutation, ndim, "permutation"):
+ return None
+ normalized = [int(x) for x in permutation]
+ if sorted(normalized) != list(range(ndim)):
+ raise ValueError(
+ "permutation must contain each dimension index exactly once")
+ return normalized
+
+
+def _validate_uniform_shape(uniform_shape, ndim):
+ if not _check_sequence_param(uniform_shape, ndim, "uniform_shape"):
+ return
+ for value in uniform_shape:
+ if value is not None and value < 0:
+ raise ValueError(
+ "uniform_shape must contain non-negative values")
+
+
+def _infer_uniform_shape(shape_rows, ndim):
+ if len(shape_rows) == 0:
+ return None
+ inferred = []
+ for i in range(ndim):
+ axis_size = shape_rows[0][i]
+ if all(shape[i] == axis_size for shape in shape_rows):
+ inferred.append(axis_size)
+ else:
+ inferred.append(None)
+ if all(x is None for x in inferred):
+ return None
+ return inferred
+
+
+def _permutation_from_strides(arr):
+ """Infer the dimension permutation from array strides.
+
+ Note: for arrays with size-1 dimensions, the inferred permutation
+ may be unreliable since size-1 strides are unconstrained. Callers
+ should skip permutation validation for such arrays.
+ """
+ return [int(x) for x in
+ (-np.array(arr.strides, dtype=np.int64)).argsort(kind="stable")]
+
+
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([values, shapes], names=["data",
"shape"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ -- child 1 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ """
+
+ @staticmethod
+ def from_numpy_ndarray(obj, dim_names=None, permutation=None,
uniform_shape=None,
+ value_type=None, ndim=None):
+ """
+ Convert a sequence of numpy.ndarrays to a variable shape tensor
extension array.
+ The length of the input sequence becomes the length of the output
array.
+
+ Parameters
+ ----------
+ obj : Sequence[numpy.ndarray]
+ Sequence of ndarrays with matching dtype, ndim, and memory
permutation.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list of integers, default None
+ Physical permutation for all input arrays. If None, inferred from
strides.
+ uniform_shape : tuple or list of integers or None, default None
+ Optional known uniform dimensions in physical order. If None,
inferred.
+ value_type : pyarrow.DataType or numpy dtype, default None
+ Optional explicit tensor value type. Required with empty input.
+ ndim : int, default None
+ Optional explicit tensor rank. Required with empty input.
+
+ Returns
+ -------
+ VariableShapeTensorArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> arrays = [np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32),
+ ... np.array([[7, 8]], dtype=np.int32)]
+ >>> pa.VariableShapeTensorArray.from_numpy_ndarray(arrays)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ ...
+ """
+ cdef:
+ list arrays
+ list shape_rows
+ int array_ndim
+ int i
+ object base_dtype
+ DataType arrow_type
+ list normalized_permutation
+ list permutation_metadata
+ DataType shape_type
+ Array values
+ Array shapes
+ StructArray struct_arr
+ VariableShapeTensorType ext_type
+
+ if isinstance(obj, np.ndarray):
+ raise TypeError("obj must be a sequence of numpy arrays")
Review Comment:
I think the first check is already covered by the second one and could be
omitted?
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -1184,6 +1192,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
void set_chunksize(int64_t chunksize)
cdef cppclass CTensor" arrow::Tensor":
+ CTensor(const shared_ptr[CDataType]& type,
Review Comment:
Here also, I am not sure if we actually need this?
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -908,6 +908,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const shared_ptr[CBuffer] null_bitmap,
)
+ @staticmethod
Review Comment:
I am not sure where this is needed?
##########
python/pyarrow/array.pxi:
##########
@@ -4955,6 +4945,321 @@ cdef class Bool8Array(ExtensionArray):
return Bool8Array.from_storage(storage_arr)
+def _check_sequence_param(value, ndim, name):
+ if value is None:
+ return False
+ if not isinstance(value, Sequence):
+ raise TypeError(f"{name} must be a tuple or list")
+ if len(value) != ndim:
+ raise ValueError(
+ (f"The length of {name} ({len(value)}) does not match"
+ f" the number of tensor dimensions ({ndim})."))
+ return True
+
+
+def _validate_dim_names(dim_names, ndim):
+ if not _check_sequence_param(dim_names, ndim, "dim_names"):
+ return
+ if not all(isinstance(name, str) for name in dim_names):
+ raise TypeError("Each element of dim_names must be a string")
+
+
+def _validate_permutation(permutation, ndim):
+ if not _check_sequence_param(permutation, ndim, "permutation"):
+ return None
+ normalized = [int(x) for x in permutation]
+ if sorted(normalized) != list(range(ndim)):
+ raise ValueError(
+ "permutation must contain each dimension index exactly once")
+ return normalized
+
+
+def _validate_uniform_shape(uniform_shape, ndim):
+ if not _check_sequence_param(uniform_shape, ndim, "uniform_shape"):
+ return
+ for value in uniform_shape:
+ if value is not None and value < 0:
+ raise ValueError(
+ "uniform_shape must contain non-negative values")
+
+
+def _infer_uniform_shape(shape_rows, ndim):
+ if len(shape_rows) == 0:
+ return None
+ inferred = []
+ for i in range(ndim):
+ axis_size = shape_rows[0][i]
+ if all(shape[i] == axis_size for shape in shape_rows):
+ inferred.append(axis_size)
+ else:
+ inferred.append(None)
+ if all(x is None for x in inferred):
+ return None
+ return inferred
+
+
+def _permutation_from_strides(arr):
+ """Infer the dimension permutation from array strides.
+
+ Note: for arrays with size-1 dimensions, the inferred permutation
+ may be unreliable since size-1 strides are unconstrained. Callers
+ should skip permutation validation for such arrays.
+ """
+ return [int(x) for x in
+ (-np.array(arr.strides, dtype=np.int64)).argsort(kind="stable")]
+
+
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([values, shapes], names=["data",
"shape"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ -- child 1 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ """
+
+ @staticmethod
+ def from_numpy_ndarray(obj, dim_names=None, permutation=None,
uniform_shape=None,
+ value_type=None, ndim=None):
+ """
+ Convert a sequence of numpy.ndarrays to a variable shape tensor
extension array.
+ The length of the input sequence becomes the length of the output
array.
+
+ Parameters
+ ----------
+ obj : Sequence[numpy.ndarray]
+ Sequence of ndarrays with matching dtype, ndim, and memory
permutation.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list of integers, default None
+ Physical permutation for all input arrays. If None, inferred from
strides.
+ uniform_shape : tuple or list of integers or None, default None
+ Optional known uniform dimensions in physical order. If None,
inferred.
+ value_type : pyarrow.DataType or numpy dtype, default None
+ Optional explicit tensor value type. Required with empty input.
+ ndim : int, default None
+ Optional explicit tensor rank. Required with empty input.
+
+ Returns
+ -------
+ VariableShapeTensorArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> arrays = [np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32),
+ ... np.array([[7, 8]], dtype=np.int32)]
+ >>> pa.VariableShapeTensorArray.from_numpy_ndarray(arrays)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ ...
+ """
+ cdef:
+ list arrays
+ list shape_rows
+ int array_ndim
+ int i
+ object base_dtype
+ DataType arrow_type
+ list normalized_permutation
+ list permutation_metadata
+ DataType shape_type
+ Array values
+ Array shapes
+ StructArray struct_arr
+ VariableShapeTensorType ext_type
+
+ if isinstance(obj, np.ndarray):
+ raise TypeError("obj must be a sequence of numpy arrays")
+ if not isinstance(obj, Sequence) or isinstance(obj, (str, bytes)):
+ raise TypeError("obj must be a sequence of numpy arrays")
+ arrays = list(obj)
+
+ if value_type is not None and not isinstance(value_type, DataType):
+ try:
+ value_type = from_numpy_dtype(np.dtype(value_type))
+ except (TypeError, ValueError) as exc:
+ raise TypeError(
+ "value_type must be a pyarrow.DataType or numpy dtype"
+ ) from exc
+
+ if len(arrays) == 0:
+ if value_type is None or ndim is None:
+ raise ValueError(
+ "For empty input, both value_type and ndim must be
provided")
+ if ndim < 0:
+ raise ValueError("ndim must be non-negative")
+
+ _validate_dim_names(dim_names, ndim)
+ permutation = _validate_permutation(permutation, ndim)
+ _validate_uniform_shape(uniform_shape, ndim)
+
+ shape_type = list_(int32(), list_size=ndim)
+ values = array([], list_(value_type))
+ shapes = array([], shape_type)
+ struct_arr = StructArray.from_arrays(
+ [values, shapes], names=["data", "shape"])
+ ext_type = variable_shape_tensor(
+ value_type,
+ ndim,
+ dim_names=dim_names,
+ permutation=permutation,
+ uniform_shape=uniform_shape
+ )
+ return ExtensionArray.from_storage(ext_type, struct_arr)
+
+ for i, arr in enumerate(arrays):
+ if not isinstance(arr, np.ndarray):
+ raise TypeError(f"obj[{i}] must be a numpy.ndarray")
+ if arr.ndim == 0:
+ raise ValueError("Cannot convert scalar to variable shape
tensor array")
+
+ base_dtype = arrays[0].dtype
+ array_ndim = arrays[0].ndim
+ arrow_type = from_numpy_dtype(base_dtype)
+
+ if value_type is not None and value_type != arrow_type:
Review Comment:
Do we need to accept `value_type` if the input is not empty, as it has to be
the same as the base type anyway? Not sure what the other options are, but it
feels a bit funny that we offer this option when it then has to be a specific
value.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]