This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d21c1c79c0 GH-34882: [Python] Binding for FixedShapeTensorType (#34883)
d21c1c79c0 is described below
commit d21c1c79c0d6377d035bd90da5d5d09e04e49079
Author: Alenka Frim <[email protected]>
AuthorDate: Tue Apr 11 16:37:03 2023 +0200
GH-34882: [Python] Binding for FixedShapeTensorType (#34883)
### Rationale for this change
The fixed shape tensor canonical extension type is implemented in C++
(https://github.com/apache/arrow/pull/8510), so we can add bindings for the
extension type in Python.
### What changes are included in this PR?
Binding for fixed shape tensor canonical extension type.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* Closes: #34882
Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Rok Mihevc <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
docs/source/format/CanonicalExtensions.rst | 2 +
python/pyarrow/__init__.py | 5 +-
python/pyarrow/array.pxi | 109 +++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 21 ++++
python/pyarrow/lib.pxd | 5 +
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/tests/test_extension_type.py | 96 +++++++++++++++
python/pyarrow/types.pxi | 180 ++++++++++++++++++++++++++++
8 files changed, 418 insertions(+), 2 deletions(-)
diff --git a/docs/source/format/CanonicalExtensions.rst
b/docs/source/format/CanonicalExtensions.rst
index 92dc1b2db9..5dd269ee5c 100644
--- a/docs/source/format/CanonicalExtensions.rst
+++ b/docs/source/format/CanonicalExtensions.rst
@@ -72,6 +72,8 @@ same rules as laid out above, and provide backwards
compatibility guarantees.
Official List
=============
+.. _fixed_shape_tensor_extension:
+
Fixed shape tensor
==================
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 6ed2df080d..ecbce5c4d9 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -170,6 +170,7 @@ from pyarrow.lib import (null, bool_,
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
+ fixed_shape_tensor,
field,
type_for_alias,
DataType, DictionaryType, StructType,
@@ -178,7 +179,7 @@ from pyarrow.lib import (null, bool_,
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
- RunEndEncodedType,
+ RunEndEncodedType, FixedShapeTensorType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
@@ -209,7 +210,7 @@ from pyarrow.lib import (null, bool_,
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray,
ExtensionArray,
- RunEndEncodedArray,
+ RunEndEncodedArray, FixedShapeTensorArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 715c0aced6..11f10dddef 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -3075,6 +3075,115 @@ cdef class ExtensionArray(Array):
return Array._to_pandas(self.storage, options, **kwargs)
+class FixedShapeTensorArray(ExtensionArray):
+    """
+    Concrete class for fixed shape tensor extension arrays.
+
+    Examples
+    --------
+    Define the extension type for tensor array
+
+    >>> import pyarrow as pa
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+
+    Create an extension array
+
+    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+    >>> pa.ExtensionArray.from_storage(tensor_type, storage)
+    <pyarrow.lib.FixedShapeTensorArray object at ...>
+    [
+      [
+        1,
+        2,
+        3,
+        4
+      ],
+      [
+        10,
+        20,
+        30,
+        40
+      ],
+      [
+        100,
+        200,
+        300,
+        400
+      ]
+    ]
+    """
+
+    def to_numpy_ndarray(self):
+        """
+        Convert fixed shape tensor extension array to a numpy array (with dim+1).
+
+        Note: ``permutation`` should be trivial (``None`` or
+        ``[0, 1, ..., len(shape)-1]``).
+
+        Returns
+        -------
+        numpy.ndarray
+            Array of shape ``(len(self),) + tuple(self.type.shape)``.
+
+        Raises
+        ------
+        ValueError
+            If the type carries a non-trivial permutation.
+        """
+        permutation = self.type.permutation
+        identity = list(range(len(self.type.shape)))
+        if permutation is not None and permutation != identity:
+            raise ValueError(
+                'Only non-permuted tensors can be converted to numpy tensors.')
+
+        # NOTE(review): the flat child values are reshaped using len(self);
+        # confirm this is correct for sliced arrays and arrays with nulls.
+        np_flat = np.asarray(self.storage.values)
+        return np_flat.reshape((len(self),) + tuple(self.type.shape))
+
+    @staticmethod
+    def from_numpy_ndarray(obj):
+        """
+        Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
+        The first dimension of ndarray will become the length of the fixed
+        shape tensor array.
+
+        Numpy array needs to be C-contiguous in memory
+        (``obj.flags["C_CONTIGUOUS"]==True``).
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+
+        Returns
+        -------
+        FixedShapeTensorArray
+
+        Raises
+        ------
+        ValueError
+            If ``obj`` is 0-dimensional or is not C-contiguous.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> arr = np.array(
+        ...     [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+        ...     dtype=np.float32)
+        >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+        <pyarrow.lib.FixedShapeTensorArray object at ...>
+        [
+          [
+            1,
+            2,
+            3,
+            4,
+            5,
+            6
+          ],
+          [
+            1,
+            2,
+            3,
+            4,
+            5,
+            6
+          ]
+        ]
+        """
+        if obj.ndim == 0:
+            # There is no leading dimension to use as the array length.
+            raise ValueError(
+                'Cannot convert a 0-dimensional numpy array to a fixed shape '
+                'tensor array.')
+        if not obj.flags["C_CONTIGUOUS"]:
+            raise ValueError('The data in the numpy array need to be in a single, '
+                             'C-style contiguous segment.')
+
+        arrow_type = from_numpy_dtype(obj.dtype)
+        shape = obj.shape[1:]
+        # Number of elements per tensor, as an integer. Dividing obj.size by
+        # obj.shape[0] yields a float and fails for an empty first dimension.
+        size = int(np.prod(shape))
+
+        return ExtensionArray.from_storage(
+            fixed_shape_tensor(arrow_type, shape),
+            FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size)
+        )
+
+
cdef dict _array_classes = {
_Type_NA: NullArray,
_Type_BOOL: BooleanArray,
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 19c9bdc2da..8fc531dff0 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2622,6 +2622,27 @@ cdef extern from "arrow/extension_type.h" namespace
"arrow":
shared_ptr[CArray] storage()
+# C++ declarations for the fixed shape tensor extension type.
+cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace
"arrow::extension":
+ cdef cppclass CFixedShapeTensorType \
+ " arrow::extension::FixedShapeTensorType"(CExtensionType):
+
+ # Factory: builds the type from value type, shape, permutation and
+ # dimension names; the data type is returned wrapped in a Result.
+ @staticmethod
+ CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]&
value_type,
+ const vector[int64_t]& shape,
+ const vector[int64_t]& permutation,
+ const vector[c_string]& dim_names)
+
+ # Declared as an instance method (no @staticmethod here).
+ CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType]
storage_type,
+ const c_string&
serialized_data) const
+
+ c_string Serialize() const
+
+ # Accessors for the parameters of the type.
+ const shared_ptr[CDataType] value_type()
+ const vector[int64_t] shape()
+ const vector[int64_t] permutation()
+ const vector[c_string] dim_names()
+
+
cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index d984475171..54e14005f6 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -199,6 +199,11 @@ cdef class ExtensionType(BaseExtensionType):
const CPyExtensionType* cpy_ext_type
+cdef class FixedShapeTensorType(BaseExtensionType):
+ # Typed pointer to the underlying C++ fixed shape tensor extension type.
+ cdef:
+ const CFixedShapeTensorType* tensor_ext_type
+
+
cdef class PyExtensionType(ExtensionType):
pass
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index fadc659d45..72e16f2cec 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -118,6 +118,8 @@ cdef api object pyarrow_wrap_data_type(
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
if cpy_ext_type != nullptr:
return cpy_ext_type.GetInstance()
+ elif ext_type.extension_name() == b"arrow.fixed_shape_tensor":
+ out = FixedShapeTensorType.__new__(FixedShapeTensorType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 96b3a9c26d..e6268823aa 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1144,6 +1144,102 @@ def test_cpp_extension_in_python(tmpdir):
assert reconstructed_array == array
+def test_tensor_type():
+    """Check the attributes exposed by fixed shape tensor types."""
+    # Only value type and shape given: no dim_names, no permutation.
+    plain = pa.fixed_shape_tensor(pa.int8(), [2, 3])
+    assert plain.extension_name == "arrow.fixed_shape_tensor"
+    assert plain.storage_type == pa.list_(pa.int8(), 6)
+    assert plain.shape == [2, 3]
+    assert plain.dim_names is None
+    assert plain.permutation is None
+
+    # With an explicit permutation.
+    permuted = pa.fixed_shape_tensor(
+        pa.float64(), [2, 2, 3], permutation=[0, 2, 1])
+    assert permuted.extension_name == "arrow.fixed_shape_tensor"
+    assert permuted.storage_type == pa.list_(pa.float64(), 12)
+    assert permuted.shape == [2, 2, 3]
+    assert permuted.dim_names is None
+    assert permuted.permutation == [0, 2, 1]
+
+    # With explicit dimension names.
+    named = pa.fixed_shape_tensor(
+        pa.bool_(), [2, 2, 3], dim_names=['C', 'H', 'W'])
+    assert named.extension_name == "arrow.fixed_shape_tensor"
+    assert named.storage_type == pa.list_(pa.bool_(), 12)
+    assert named.shape == [2, 2, 3]
+    assert named.dim_names == ['C', 'H', 'W']
+    assert named.permutation is None
+
+
+def test_tensor_class_methods():
+    """Exercise to_numpy_ndarray / from_numpy_ndarray and their errors."""
+    nested = [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]
+
+    # Arrow -> numpy: two tensors of shape (2, 3).
+    float_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
+    float_storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
+                             pa.list_(pa.float32(), 6))
+    tensor_arr = pa.ExtensionArray.from_storage(float_type, float_storage)
+    expected = np.array(nested, dtype=np.float32)
+    np.testing.assert_array_equal(tensor_arr.to_numpy_ndarray(), expected)
+
+    # numpy -> Arrow: a C-ordered ndarray is accepted.
+    c_ordered = np.array(nested, dtype=np.float32, order="C")
+    from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(c_ordered)
+    assert isinstance(from_numpy.type, pa.FixedShapeTensorType)
+    assert from_numpy.type.value_type == pa.float32()
+    assert from_numpy.type.shape == [2, 3]
+
+    # numpy -> Arrow: a Fortran-ordered ndarray is rejected.
+    f_ordered = np.array(nested, dtype=np.float32, order="F")
+    with pytest.raises(ValueError, match="C-style contiguous segment"):
+        pa.FixedShapeTensorArray.from_numpy_ndarray(f_ordered)
+
+    # Arrow -> numpy: a non-trivial permutation is rejected.
+    permuted_type = pa.fixed_shape_tensor(
+        pa.int8(), [2, 2, 3], permutation=[0, 2, 1])
+    permuted_storage = pa.array(
+        [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12))
+    permuted_arr = pa.ExtensionArray.from_storage(
+        permuted_type, permuted_storage)
+    with pytest.raises(ValueError, match="non-permuted tensors"):
+        permuted_arr.to_numpy_ndarray()
+
+
+@pytest.mark.parametrize("tensor_type", (
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]),
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], dim_names=['C', 'H', 'W'])
+))
+def test_tensor_type_ipc(tensor_type):
+    """Round-trip a fixed shape tensor array through IPC."""
+    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]],
+                       pa.list_(pa.int8(), 12))
+    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+    batch = pa.RecordBatch.from_arrays([arr], ["ext"])
+
+    # check the built array has exactly the expected class
+    tensor_class = tensor_type.__arrow_ext_class__()
+    assert type(arr) == tensor_class
+
+    buf = ipc_write_batch(batch)
+    del batch
+    batch = ipc_read_batch(buf)
+
+    result = batch.column(0)
+    # check the deserialized array class is the expected one
+    assert type(result) == tensor_class
+    assert result.type.extension_name == "arrow.fixed_shape_tensor"
+    # compare the storage of the deserialized array, not of the input
+    assert result.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]]
+
+    # we get back an actual TensorType
+    assert isinstance(result.type, pa.FixedShapeTensorType)
+    assert result.type.value_type == pa.int8()
+    assert result.type.shape == [2, 2, 3]
+
+
+def test_tensor_type_equality():
+    """Types compare equal only when their parameters match."""
+    int8_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
+    assert int8_type.extension_name == "arrow.fixed_shape_tensor"
+
+    same_params = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
+    other_value_type = pa.fixed_shape_tensor(pa.uint8(), [2, 2, 3])
+    assert int8_type == same_params
+    assert not int8_type == other_value_type
+
+
@pytest.mark.pandas
def test_extension_to_pandas_storage_type(registered_period_type):
period_type, _ = registered_period_type
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0e81706660..4ad2536368 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1494,6 +1494,86 @@ cdef class ExtensionType(BaseExtensionType):
"""
return ExtensionScalar
+
+cdef class FixedShapeTensorType(BaseExtensionType):
+ """
+ Concrete class for fixed shape tensor extension type.
+
+ Examples
+ --------
+ Create an instance of fixed shape tensor extension type:
+
+ >>> import pyarrow as pa
+ >>> pa.fixed_shape_tensor(pa.int32(), [2, 2])
+ FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)
+
+ Create an instance of fixed shape tensor extension type with
+ permutation:
+
+ >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+ ... permutation=[0, 2, 1])
+ >>> tensor_type.permutation
+ [0, 2, 1]
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ BaseExtensionType.init(self, type)
+ # Keep a typed pointer; the properties below call the tensor-specific
+ # accessors through it.
+ self.tensor_ext_type = <const CFixedShapeTensorType*> type.get()
+
+ @property
+ def value_type(self):
+ """
+ Data type of the individual tensor elements.
+ """
+ return pyarrow_wrap_data_type(self.tensor_ext_type.value_type())
+
+ @property
+ def shape(self):
+ """
+ Shape of the tensors.
+ """
+ return self.tensor_ext_type.shape()
+
+ @property
+ def dim_names(self):
+ """
+ Explicit names of the dimensions, or None if no names are set.
+ """
+ list_of_bytes = self.tensor_ext_type.dim_names()
+ if len(list_of_bytes) != 0:
+ return [frombytes(x) for x in list_of_bytes]
+ else:
+ return None
+
+ @property
+ def permutation(self):
+ """
+ Indices of the dimensions ordering, or None if no permutation is set.
+ """
+ indices = self.tensor_ext_type.permutation()
+ if len(indices) != 0:
+ return indices
+ else:
+ return None
+
+ def __arrow_ext_serialize__(self):
+ """
+ Serialized representation of metadata to reconstruct the type object.
+ """
+ return self.tensor_ext_type.Serialize()
+
+ # NOTE(review): this is a classmethod, yet it reads ``tensor_ext_type``,
+ # which ``init`` sets per instance; confirm this works when called on the
+ # class.
+ @classmethod
+ def __arrow_ext_deserialize__(self, storage_type, serialized):
+ """
+ Return a FixedShapeTensorType instance from the storage type and
+ serialized metadata.
+ """
+ return self.tensor_ext_type.Deserialize(storage_type, serialized)
+
+ def __arrow_ext_class__(self):
+ """
+ Return FixedShapeTensorArray, the array class for this extension type.
+ """
+ return FixedShapeTensorArray
+
+
cdef class PyExtensionType(ExtensionType):
"""
Concrete base class for Python-defined extension types based on pickle
@@ -4543,6 +4623,106 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None):
+    """
+    Create instance of fixed shape tensor extension type with shape and optional
+    names of tensor dimensions and indices of the desired logical
+    ordering of dimensions.
+
+    Parameters
+    ----------
+    value_type : DataType
+        Data type of individual tensor elements.
+    shape : tuple or list of integers
+        The physical shape of the contained tensors.
+    dim_names : tuple or list of strings, default None
+        Explicit names to tensor dimensions.
+    permutation : tuple or list of integers, default None
+        Indices of the desired ordering of the original dimensions.
+        The indices contain a permutation of the values ``[0, 1, .., N-1]`` where
+        N is the number of dimensions. The permutation indicates which dimension
+        of the logical layout corresponds to which dimension of the physical tensor.
+        For more information on this parameter see
+        :ref:`fixed_shape_tensor_extension`.
+
+    Returns
+    -------
+    type : FixedShapeTensorType
+
+    Raises
+    ------
+    TypeError
+        If ``value_type`` or ``shape`` is None.
+
+    Examples
+    --------
+    Create an instance of fixed shape tensor extension type:
+
+    >>> import pyarrow as pa
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+    >>> tensor_type
+    FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)
+
+    Inspect the data type:
+
+    >>> tensor_type.value_type
+    DataType(int32)
+    >>> tensor_type.shape
+    [2, 2]
+
+    Create a table with fixed shape tensor extension array:
+
+    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+    >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+    >>> pa.table([tensor], names=["tensor_array"])
+    pyarrow.Table
+    tensor_array: extension<arrow.fixed_shape_tensor>
+    ----
+    tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]]
+
+    Create an instance of fixed shape tensor extension type with names
+    of tensor dimensions:
+
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+    ...                                     dim_names=['C', 'H', 'W'])
+    >>> tensor_type.dim_names
+    ['C', 'H', 'W']
+
+    Create an instance of fixed shape tensor extension type with
+    permutation:
+
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+    ...                                     permutation=[0, 2, 1])
+    >>> tensor_type.permutation
+    [0, 2, 1]
+    """
+
+    cdef:
+        vector[int64_t] c_shape
+        vector[int64_t] c_permutation
+        vector[c_string] c_dim_names
+        shared_ptr[CDataType] c_tensor_ext_type
+        FixedShapeTensorType out
+
+    # Validate explicitly instead of with ``assert``: assertions can be
+    # compiled out, and ``value_type.sp_type`` must not be read on None.
+    if value_type is None:
+        raise TypeError("value_type must be a DataType, not None")
+    if shape is None:
+        raise TypeError("shape must be a tuple or list of integers, not None")
+
+    for i in shape:
+        c_shape.push_back(i)
+
+    # An empty vector stands for "not specified" for both optional arguments.
+    if permutation is not None:
+        for i in permutation:
+            c_permutation.push_back(i)
+
+    if dim_names is not None:
+        for x in dim_names:
+            c_dim_names.push_back(tobytes(x))
+
+    c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make(
+        value_type.sp_type, c_shape, c_permutation, c_dim_names))
+
+    out = FixedShapeTensorType.__new__(FixedShapeTensorType)
+    out.init(c_tensor_ext_type)
+
+    return out
+
+
cdef dict _type_aliases = {
'null': null,
'bool': bool_,