This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new d21c1c79c0 GH-34882: [Python] Binding for FixedShapeTensorType (#34883)
d21c1c79c0 is described below

commit d21c1c79c0d6377d035bd90da5d5d09e04e49079
Author: Alenka Frim <[email protected]>
AuthorDate: Tue Apr 11 16:37:03 2023 +0200

    GH-34882: [Python] Binding for FixedShapeTensorType (#34883)
    
    ### Rationale for this change
    In C++, the fixed shape tensor canonical extension type is implemented in 
https://github.com/apache/arrow/pull/8510, so we can add bindings to the 
extension type in Python.
    
    ### What changes are included in this PR?
    Binding for fixed shape tensor canonical extension type.
    
    ### Are these changes tested?
    Yes.
    
    ### Are there any user-facing changes?
    No.
    * Closes: #34882
    
    Lead-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Co-authored-by: Rok Mihevc <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 docs/source/format/CanonicalExtensions.rst  |   2 +
 python/pyarrow/__init__.py                  |   5 +-
 python/pyarrow/array.pxi                    | 109 +++++++++++++++++
 python/pyarrow/includes/libarrow.pxd        |  21 ++++
 python/pyarrow/lib.pxd                      |   5 +
 python/pyarrow/public-api.pxi               |   2 +
 python/pyarrow/tests/test_extension_type.py |  96 +++++++++++++++
 python/pyarrow/types.pxi                    | 180 ++++++++++++++++++++++++++++
 8 files changed, 418 insertions(+), 2 deletions(-)

diff --git a/docs/source/format/CanonicalExtensions.rst 
b/docs/source/format/CanonicalExtensions.rst
index 92dc1b2db9..5dd269ee5c 100644
--- a/docs/source/format/CanonicalExtensions.rst
+++ b/docs/source/format/CanonicalExtensions.rst
@@ -72,6 +72,8 @@ same rules as laid out above, and provide backwards 
compatibility guarantees.
 Official List
 =============
 
+.. _fixed_shape_tensor_extension:
+
 Fixed shape tensor
 ==================
 
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 6ed2df080d..ecbce5c4d9 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -170,6 +170,7 @@ from pyarrow.lib import (null, bool_,
                          union, sparse_union, dense_union,
                          dictionary,
                          run_end_encoded,
+                         fixed_shape_tensor,
                          field,
                          type_for_alias,
                          DataType, DictionaryType, StructType,
@@ -178,7 +179,7 @@ from pyarrow.lib import (null, bool_,
                          TimestampType, Time32Type, Time64Type, DurationType,
                          FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                          BaseExtensionType, ExtensionType,
-                         RunEndEncodedType,
+                         RunEndEncodedType, FixedShapeTensorType,
                          PyExtensionType, UnknownExtensionType,
                          register_extension_type, unregister_extension_type,
                          DictionaryMemo,
@@ -209,7 +210,7 @@ from pyarrow.lib import (null, bool_,
                          Time32Array, Time64Array, DurationArray,
                          MonthDayNanoIntervalArray,
                          Decimal128Array, Decimal256Array, StructArray, 
ExtensionArray,
-                         RunEndEncodedArray,
+                         RunEndEncodedArray, FixedShapeTensorArray,
                          scalar, NA, _NULL as NULL, Scalar,
                          NullScalar, BooleanScalar,
                          Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 715c0aced6..11f10dddef 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -3075,6 +3075,115 @@ cdef class ExtensionArray(Array):
         return Array._to_pandas(self.storage, options, **kwargs)
 
 
+class FixedShapeTensorArray(ExtensionArray):
+    """
+    Concrete class for fixed shape tensor extension arrays.
+
+    Examples
+    --------
+    Define the extension type for tensor array
+
+    >>> import pyarrow as pa
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+
+    Create an extension array
+
+    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+    >>> pa.ExtensionArray.from_storage(tensor_type, storage)
+    <pyarrow.lib.FixedShapeTensorArray object at ...>
+    [
+      [
+        1,
+        2,
+        3,
+        4
+      ],
+      [
+        10,
+        20,
+        30,
+        40
+      ],
+      [
+        100,
+        200,
+        300,
+        400
+      ]
+    ]
+    """
+
+    def to_numpy_ndarray(self):
+        """
+        Convert fixed shape tensor extension array to a numpy array (with 
dim+1).
+
+        Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., 
len(shape)-1]``).
+        """
+        if self.type.permutation is None or self.type.permutation == 
list(range(len(self.type.shape))):
+            np_flat = np.asarray(self.storage.values)
+            numpy_tensor = np_flat.reshape((len(self),) + 
tuple(self.type.shape))
+            return numpy_tensor
+        else:
+            raise ValueError(
+                'Only non-permuted tensors can be converted to numpy tensors.')
+
+    @staticmethod
+    def from_numpy_ndarray(obj):
+        """
+        Convert numpy tensors (ndarrays) to a fixed shape tensor extension 
array.
+        The first dimension of ndarray will become the length of the fixed
+        shape tensor array.
+
+        Numpy array needs to be C-contiguous in memory
+        (``obj.flags["C_CONTIGUOUS"]==True``).
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> arr = np.array(
+        ...         [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+        ...         dtype=np.float32)
+        >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+        <pyarrow.lib.FixedShapeTensorArray object at ...>
+        [
+          [
+            1,
+            2,
+            3,
+            4,
+            5,
+            6
+          ],
+          [
+            1,
+            2,
+            3,
+            4,
+            5,
+            6
+          ]
+        ]
+        """
+        if not obj.flags["C_CONTIGUOUS"]:
+            raise ValueError('The data in the numpy array need to be in a 
single, '
+                             'C-style contiguous segment.')
+
+        arrow_type = from_numpy_dtype(obj.dtype)
+        shape = obj.shape[1:]
+        size = obj.size / obj.shape[0]
+
+        return ExtensionArray.from_storage(
+            fixed_shape_tensor(arrow_type, shape),
+            FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size)
+        )
+
+
 cdef dict _array_classes = {
     _Type_NA: NullArray,
     _Type_BOOL: BooleanArray,
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 19c9bdc2da..8fc531dff0 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2622,6 +2622,27 @@ cdef extern from "arrow/extension_type.h" namespace 
"arrow":
         shared_ptr[CArray] storage()
 
 
+cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace 
"arrow::extension":
+    cdef cppclass CFixedShapeTensorType \
+            " arrow::extension::FixedShapeTensorType"(CExtensionType):
+
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& 
value_type,
+                                            const vector[int64_t]& shape,
+                                            const vector[int64_t]& permutation,
+                                            const vector[c_string]& dim_names)
+
+        CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] 
storage_type,
+                                                   const c_string& 
serialized_data) const
+
+        c_string Serialize() const
+
+        const shared_ptr[CDataType] value_type()
+        const vector[int64_t] shape()
+        const vector[int64_t] permutation()
+        const vector[c_string] dim_names()
+
+
 cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
     cdef enum CCompressionType" arrow::Compression::type":
         CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index d984475171..54e14005f6 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -199,6 +199,11 @@ cdef class ExtensionType(BaseExtensionType):
         const CPyExtensionType* cpy_ext_type
 
 
+cdef class FixedShapeTensorType(BaseExtensionType):
+    cdef:
+        const CFixedShapeTensorType* tensor_ext_type
+
+
 cdef class PyExtensionType(ExtensionType):
     pass
 
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index fadc659d45..72e16f2cec 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -118,6 +118,8 @@ cdef api object pyarrow_wrap_data_type(
         cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
         if cpy_ext_type != nullptr:
             return cpy_ext_type.GetInstance()
+        elif ext_type.extension_name() == b"arrow.fixed_shape_tensor":
+            out = FixedShapeTensorType.__new__(FixedShapeTensorType)
         else:
             out = BaseExtensionType.__new__(BaseExtensionType)
     else:
diff --git a/python/pyarrow/tests/test_extension_type.py 
b/python/pyarrow/tests/test_extension_type.py
index 96b3a9c26d..e6268823aa 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1144,6 +1144,102 @@ def test_cpp_extension_in_python(tmpdir):
     assert reconstructed_array == array
 
 
+def test_tensor_type():
+    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
+    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+    assert tensor_type.storage_type == pa.list_(pa.int8(), 6)
+    assert tensor_type.shape == [2, 3]
+    assert tensor_type.dim_names is None
+    assert tensor_type.permutation is None
+
+    tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3],
+                                        permutation=[0, 2, 1])
+    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+    assert tensor_type.storage_type == pa.list_(pa.float64(), 12)
+    assert tensor_type.shape == [2, 2, 3]
+    assert tensor_type.dim_names is None
+    assert tensor_type.permutation == [0, 2, 1]
+
+    tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3],
+                                        dim_names=['C', 'H', 'W'])
+    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+    assert tensor_type.storage_type == pa.list_(pa.bool_(), 12)
+    assert tensor_type.shape == [2, 2, 3]
+    assert tensor_type.dim_names == ['C', 'H', 'W']
+    assert tensor_type.permutation is None
+
+
+def test_tensor_class_methods():
+    tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
+    storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
+                       pa.list_(pa.float32(), 6))
+    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+    expected = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
+    result = arr.to_numpy_ndarray()
+    np.testing.assert_array_equal(result, expected)
+
+    arr = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+        dtype=np.float32, order="C")
+    tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+    assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
+    assert tensor_array_from_numpy.type.value_type == pa.float32()
+    assert tensor_array_from_numpy.type.shape == [2, 3]
+
+    arr = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+        dtype=np.float32, order="F")
+    with pytest.raises(ValueError, match="C-style contiguous segment"):
+        pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+
+    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 
2, 1])
+    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], 
pa.list_(pa.int8(), 12))
+    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+    with pytest.raises(ValueError, match="non-permuted tensors"):
+        arr.to_numpy_ndarray()
+
+
[email protected]("tensor_type", (
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]),
+    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], dim_names=['C', 'H', 'W'])
+))
+def test_tensor_type_ipc(tensor_type):
+    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], 
pa.list_(pa.int8(), 12))
+    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+    batch = pa.RecordBatch.from_arrays([arr], ["ext"])
+
+    # check the built array has exactly the expected class
+    tensor_class = tensor_type.__arrow_ext_class__()
+    assert type(arr) == tensor_class
+
+    buf = ipc_write_batch(batch)
+    del batch
+    batch = ipc_read_batch(buf)
+
+    result = batch.column(0)
+    # check the deserialized array class is the expected one
+    assert type(result) == tensor_class
+    assert result.type.extension_name == "arrow.fixed_shape_tensor"
+    assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]]
+
+    # we get back an actual TensorType
+    assert isinstance(result.type, pa.FixedShapeTensorType)
+    assert result.type.value_type == pa.int8()
+    assert result.type.shape == [2, 2, 3]
+
+
+def test_tensor_type_equality():
+    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
+    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+
+    tensor_type2 = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
+    tensor_type3 = pa.fixed_shape_tensor(pa.uint8(), [2, 2, 3])
+    assert tensor_type == tensor_type2
+    assert not tensor_type == tensor_type3
+
+
 @pytest.mark.pandas
 def test_extension_to_pandas_storage_type(registered_period_type):
     period_type, _ = registered_period_type
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0e81706660..4ad2536368 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1494,6 +1494,86 @@ cdef class ExtensionType(BaseExtensionType):
         """
         return ExtensionScalar
 
+
+cdef class FixedShapeTensorType(BaseExtensionType):
+    """
+    Concrete class for fixed shape tensor extension type.
+
+    Examples
+    --------
+    Create an instance of fixed shape tensor extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.fixed_shape_tensor(pa.int32(), [2, 2])
+    FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)
+
+    Create an instance of fixed shape tensor extension type with
+    permutation:
+
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+    ...                                     permutation=[0, 2, 1])
+    >>> tensor_type.permutation
+    [0, 2, 1]
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.tensor_ext_type = <const CFixedShapeTensorType*> type.get()
+
+    @property
+    def value_type(self):
+        """
+        Data type of an individual tensor.
+        """
+        return pyarrow_wrap_data_type(self.tensor_ext_type.value_type())
+
+    @property
+    def shape(self):
+        """
+        Shape of the tensors.
+        """
+        return self.tensor_ext_type.shape()
+
+    @property
+    def dim_names(self):
+        """
+        Explicit names of the dimensions.
+        """
+        list_of_bytes = self.tensor_ext_type.dim_names()
+        if len(list_of_bytes) != 0:
+            return [frombytes(x) for x in list_of_bytes]
+        else:
+            return None
+
+    @property
+    def permutation(self):
+        """
+        Indices of the dimensions ordering.
+        """
+        indices = self.tensor_ext_type.permutation()
+        if len(indices) != 0:
+            return indices
+        else:
+            return None
+
+    def __arrow_ext_serialize__(self):
+        """
+        Serialized representation of metadata to reconstruct the type object.
+        """
+        return self.tensor_ext_type.Serialize()
+
+    @classmethod
+    def __arrow_ext_deserialize__(self, storage_type, serialized):
+        """
+        Return a FixedShapeTensor type instance from the storage type and 
serialized
+        metadata.
+        """
+        return self.tensor_ext_type.Deserialize(storage_type, serialized)
+
+    def __arrow_ext_class__(self):
+        return FixedShapeTensorArray
+
+
 cdef class PyExtensionType(ExtensionType):
     """
     Concrete base class for Python-defined extension types based on pickle
@@ -4543,6 +4623,106 @@ def run_end_encoded(run_end_type, value_type):
     return pyarrow_wrap_data_type(ree_type)
 
 
+def fixed_shape_tensor(DataType value_type, shape, dim_names=None, 
permutation=None):
+    """
+    Create instance of fixed shape tensor extension type with shape and 
optional
+    names of tensor dimensions and indices of the desired logical
+    ordering of dimensions.
+
+    Parameters
+    ----------
+    value_type : DataType
+        Data type of individual tensor elements.
+    shape : tuple or list of integers
+        The physical shape of the contained tensors.
+    dim_names : tuple or list of strings, default None
+        Explicit names to tensor dimensions.
+    permutation : tuple or list of integers, default None
+        Indices of the desired ordering of the original dimensions.
+        The indices contain a permutation of the values ``[0, 1, .., N-1]`` 
where
+        N is the number of dimensions. The permutation indicates which 
dimension
+        of the logical layout corresponds to which dimension of the physical 
tensor.
+        For more information on this parameter see
+        :ref:`fixed_shape_tensor_extension`.
+
+    Examples
+    --------
+    Create an instance of fixed shape tensor extension type:
+
+    >>> import pyarrow as pa
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+    >>> tensor_type
+    FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)
+
+    Inspect the data type:
+
+    >>> tensor_type.value_type
+    DataType(int32)
+    >>> tensor_type.shape
+    [2, 2]
+
+    Create a table with fixed shape tensor extension array:
+
+    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+    >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+    >>> pa.table([tensor], names=["tensor_array"])
+    pyarrow.Table
+    tensor_array: extension<arrow.fixed_shape_tensor>
+    ----
+    tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]]
+
+    Create an instance of fixed shape tensor extension type with names
+    of tensor dimensions:
+
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+    ...                                     dim_names=['C', 'H', 'W'])
+    >>> tensor_type.dim_names
+    ['C', 'H', 'W']
+
+    Create an instance of fixed shape tensor extension type with
+    permutation:
+
+    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+    ...                                     permutation=[0, 2, 1])
+    >>> tensor_type.permutation
+    [0, 2, 1]
+
+    Returns
+    -------
+    type : FixedShapeTensorType
+    """
+
+    cdef:
+        vector[int64_t] c_shape
+        vector[int64_t] c_permutation
+        vector[c_string] c_dim_names
+        shared_ptr[CDataType] c_tensor_ext_type
+
+    assert value_type is not None
+    assert shape is not None
+
+    for i in shape:
+        c_shape.push_back(i)
+
+    if permutation is not None:
+        for i in permutation:
+            c_permutation.push_back(i)
+
+    if dim_names is not None:
+        for x in dim_names:
+            c_dim_names.push_back(tobytes(x))
+
+    cdef FixedShapeTensorType out = 
FixedShapeTensorType.__new__(FixedShapeTensorType)
+
+    c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make(
+        value_type.sp_type, c_shape, c_permutation, c_dim_names))
+
+    out.init(c_tensor_ext_type)
+
+    return out
+
+
 cdef dict _type_aliases = {
     'null': null,
     'bool': bool_,

Reply via email to