rok commented on code in PR #37533:
URL: https://github.com/apache/arrow/pull/37533#discussion_r1476033545
##########
cpp/src/arrow/extension/fixed_shape_tensor.h:
##########
@@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public
ExtensionType {
/// Create a FixedShapeTensorArray from ArrayData
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const
override;
+ /// \brief Create a Tensor from a Scalar value from a FixedShapeTensorArray
+ ///
+ /// This method will return a Tensor from FixedShapeTensorArray with strides
Review Comment:
Changed.
##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>>
FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor
we
// interpret the array's length as the first dimension the new tensor.
- auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
- auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
- ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
- Status::Invalid(ext_arr->value_type()->ToString(),
- " is not valid data type for a tensor"));
- auto permutation = ext_type->permutation();
+ const auto ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
+ const auto value_type = ext_type->value_type();
+ ARROW_RETURN_IF(
+ !is_fixed_width(*value_type),
+ Status::TypeError(value_type->ToString(), " is not valid data type for a
tensor"));
- std::vector<std::string> dim_names;
- if (!ext_type->dim_names().empty()) {
- for (auto i : permutation) {
- dim_names.emplace_back(ext_type->dim_names()[i]);
+ std::vector<int64_t> permutation = ext_type->permutation();
+ if (permutation.empty()) {
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
+ permutation.emplace_back(i);
}
- dim_names.insert(dim_names.begin(), 1, "");
- } else {
- dim_names = {};
}
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
+ permutation[i] += 1;
+ }
+ permutation.insert(permutation.begin(), 1, 0);
- std::vector<int64_t> shape;
- for (int64_t& i : permutation) {
- shape.emplace_back(ext_type->shape()[i]);
- ++i;
+ std::vector<std::string> dim_names = ext_type->dim_names();
+ if (!dim_names.empty()) {
+ dim_names.insert(dim_names.begin(), 1, "");
+ internal::Permute<std::string>(permutation, &dim_names);
}
+
+ std::vector<int64_t> shape = ext_type->shape();
+ auto cell_size = std::accumulate(shape.begin(), shape.end(),
static_cast<int64_t>(1),
+ std::multiplies<>());
shape.insert(shape.begin(), 1, this->length());
- permutation.insert(permutation.begin(), 1, 0);
+ internal::Permute<int64_t>(permutation, &shape);
std::vector<int64_t> tensor_strides;
- auto value_type =
internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type());
+ const auto fw_value_type =
internal::checked_pointer_cast<FixedWidthType>(value_type);
ARROW_RETURN_NOT_OK(
- ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides));
- ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten());
+ ComputeStrides(*fw_value_type.get(), shape, permutation,
&tensor_strides));
+
+ const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1];
ARROW_ASSIGN_OR_RAISE(
- auto tensor, Tensor::Make(ext_arr->value_type(),
buffers->data()->buffers[1], shape,
- tensor_strides, dim_names));
- return tensor;
+ const auto buffer,
+ SliceBufferSafe(raw_buffer, this->offset() * cell_size *
value_type->byte_width()));
+
+ return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names);
}
Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>&
shape,
const std::vector<int64_t>& permutation, const std::vector<std::string>&
dim_names) {
- if (!permutation.empty() && shape.size() != permutation.size()) {
- return Status::Invalid("permutation size must match shape size. Expected:
",
- shape.size(), " Got: ", permutation.size());
+ const auto ndim = shape.size();
+ if (!permutation.empty() && ndim != permutation.size()) {
+ return Status::Invalid("permutation size must match shape size. Expected:
", ndim,
+ " Got: ", permutation.size());
}
- if (!dim_names.empty() && shape.size() != dim_names.size()) {
- return Status::Invalid("dim_names size must match shape size. Expected: ",
- shape.size(), " Got: ", dim_names.size());
+ if (!dim_names.empty() && ndim != dim_names.size()) {
+ return Status::Invalid("dim_names size must match shape size. Expected: ",
ndim,
+ " Got: ", dim_names.size());
+ }
+ if (!permutation.empty()) {
+ std::vector<int64_t> sorted_permutation = permutation;
+ std::sort(sorted_permutation.begin(), sorted_permutation.end());
Review Comment:
1. Created `tensor_internal.h` and moved the function there.
2. Indeed. Changed to your proposal.
##########
python/pyarrow/array.pxi:
##########
@@ -3519,17 +3519,42 @@ class FixedShapeTensorArray(ExtensionArray):
def to_numpy_ndarray(self):
Review Comment:
We discussed this here:
https://github.com/apache/arrow/pull/33948#pullrequestreview-1299455833
I would personally prefer `to_numpy`, but perhaps we can have a discussion
about this under a separate issue?
cc @jorisvandenbossche @AlenkaF
##########
cpp/src/arrow/extension/fixed_shape_tensor_test.cc:
##########
@@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) {
ASSERT_EQ(expected_3, result_3);
}
+TEST_F(TestExtensionType, GetScalar) {
+ auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {},
dim_names_);
+
+ auto expected_data =
+ ArrayFromJSON(element_type_, "[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23]]");
+ auto storage_array = ArrayFromJSON(element_type_,
+ "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+ "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23]]");
+
+ auto sub_array = ExtensionType::WrapArray(ext_type, expected_data);
+ auto array = ExtensionType::WrapArray(ext_type, storage_array);
+
+ ASSERT_OK_AND_ASSIGN(auto expected_scalar, sub_array->GetScalar(0));
+ ASSERT_OK_AND_ASSIGN(auto actual_scalar, array->GetScalar(1));
+
+ ASSERT_OK(actual_scalar->ValidateFull());
+ ASSERT_TRUE(actual_scalar->type->Equals(*ext_type));
+ ASSERT_TRUE(actual_scalar->is_valid);
+
+ ASSERT_OK(expected_scalar->ValidateFull());
+ ASSERT_TRUE(expected_scalar->type->Equals(*ext_type));
+ ASSERT_TRUE(expected_scalar->is_valid);
+
+ AssertTypeEqual(actual_scalar->type, ext_type);
+ ASSERT_TRUE(actual_scalar->Equals(*expected_scalar));
+}
+
+TEST_F(TestExtensionType, GetTensor) {
+ auto arr = ArrayFromJSON(element_type_,
+ "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+ "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23]]");
+ auto element_values =
+ std::vector<std::vector<int64_t>>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+ {12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23}};
+
+ auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {},
dim_names_);
+ auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0},
{"x", "y"});
+ auto exact_ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
+ auto exact_permuted_ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(permuted_ext_type);
+
+ auto array = std::static_pointer_cast<FixedShapeTensorArray>(
+ ExtensionType::WrapArray(ext_type, arr));
+ auto permuted_array = std::static_pointer_cast<FixedShapeTensorArray>(
+ ExtensionType::WrapArray(permuted_ext_type, arr));
+
+ for (size_t i = 0; i < element_values.size(); i++) {
+ // Get tensor from extension array with trivial permutation
+ ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i));
+ auto actual_ext_scalar =
internal::checked_pointer_cast<ExtensionScalar>(scalar);
+ ASSERT_OK_AND_ASSIGN(auto actual_tensor,
+ exact_ext_type->MakeTensor(actual_ext_scalar));
+ ASSERT_OK_AND_ASSIGN(auto expected_tensor,
Review Comment:
Added here and below.
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1318,39 +1318,101 @@ def test_tensor_type():
assert tensor_type.permutation is None
-def test_tensor_class_methods():
- tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
- storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
- pa.list_(pa.float32(), 6))
[email protected]("value_type", (np.int8(), np.int64(), np.float32()))
+def test_tensor_class_methods(value_type):
+ from numpy.lib.stride_tricks import as_strided
+ arrow_type = pa.from_numpy_dtype(value_type)
+
+ tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3])
+ storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+ pa.list_(arrow_type, 6))
arr = pa.ExtensionArray.from_storage(tensor_type, storage)
expected = np.array(
- [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
- result = arr.to_numpy_ndarray()
+ [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+ np.testing.assert_array_equal(arr.to_tensor(), expected)
+ np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected)
+
+ expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+ result = arr[1:].to_numpy_ndarray()
np.testing.assert_array_equal(result, expected)
- expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
- result = arr[:1].to_numpy_ndarray()
+ values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
+ flat_arr = np.array(values[0], dtype=value_type)
+ bw = value_type.itemsize
+ storage = pa.array(values, pa.list_(arrow_type, 12))
+
+ tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0,
1, 2])
+ result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ expected = np.array(
+ [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
dtype=value_type)
+ np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+ result = flat_arr.reshape(1, 2, 3, 2)
+ expected = np.array(
+ [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
dtype=value_type)
np.testing.assert_array_equal(result, expected)
- arr = np.array(
- [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
- dtype=np.float32, order="C")
+ tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0,
2, 1])
+ result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ expected = as_strided(flat_arr, shape=(1, 2, 3, 2),
+ strides=(bw * 12, bw * 6, bw, bw * 3))
+ np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+ tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2,
0, 1])
+ result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ expected = as_strided(flat_arr, shape=(1, 3, 2, 2),
+ strides=(bw * 12, bw, bw * 6, bw * 2))
+ np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+ assert result.type.permutation == [2, 0, 1]
+ assert result.type.shape == [2, 2, 3]
+ assert result.to_tensor().shape == (1, 3, 2, 2)
+ assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw)
+
+
[email protected]("value_type", (np.int8(), np.int64(), np.float32()))
+def test_tensor_array_from_numpy(value_type):
+ from numpy.lib.stride_tricks import as_strided
+ arrow_type = pa.from_numpy_dtype(value_type)
+
+ arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+ dtype=value_type, order="C")
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
- assert tensor_array_from_numpy.type.value_type == pa.float32()
+ assert tensor_array_from_numpy.type.value_type == arrow_type
assert tensor_array_from_numpy.type.shape == [2, 3]
- arr = np.array(
- [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
- dtype=np.float32, order="F")
- with pytest.raises(ValueError, match="C-style contiguous segment"):
+ arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
+ dtype=value_type, order="F")
+ with pytest.raises(ValueError, match="First stride needs to be largest"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0,
2, 1])
- storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]],
pa.list_(pa.int8(), 12))
- arr = pa.ExtensionArray.from_storage(tensor_type, storage)
- with pytest.raises(ValueError, match="non-permuted tensors"):
- arr.to_numpy_ndarray()
+ flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
dtype=value_type)
+ bw = value_type.itemsize
+
+ arr = flat_arr.reshape(1, 3, 4)
+ tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+ assert tensor_array_from_numpy.type.shape == [3, 4]
+ assert tensor_array_from_numpy.type.permutation == [0, 1]
+ assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
+
+ arr = as_strided(flat_arr, shape=(1, 2, 3, 2),
+ strides=(bw * 12, bw * 6, bw, bw * 3))
+ tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+ assert tensor_array_from_numpy.type.shape == [2, 2, 3]
+ assert tensor_array_from_numpy.type.permutation == [0, 2, 1]
+ assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
+
+ arr = flat_arr.reshape(1, 2, 3, 2)
+ result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+ expected = np.array(
+ [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
dtype=value_type)
+ np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+ arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
dtype=value_type)
+ expected = arr[1:]
+ result =
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray()
+ np.testing.assert_array_equal(result, expected)
Review Comment:
Added. I think it also makes sense to throw errors for these cases so I
added checks in Python.
##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -293,40 +335,49 @@ const Result<std::shared_ptr<Tensor>>
FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor
we
// interpret the array's length as the first dimension the new tensor.
- auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
- auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
- ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
- Status::Invalid(ext_arr->value_type()->ToString(),
- " is not valid data type for a tensor"));
- auto permutation = ext_type->permutation();
+ const auto ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
+ const auto value_type = ext_type->value_type();
+ ARROW_RETURN_IF(
+ !is_fixed_width(*value_type),
+ Status::Invalid(value_type->ToString(), " is not valid data type for a
tensor"));
- std::vector<std::string> dim_names;
- if (!ext_type->dim_names().empty()) {
- for (auto i : permutation) {
- dim_names.emplace_back(ext_type->dim_names()[i]);
+ std::vector<int64_t> permutation = ext_type->permutation();
+ if (permutation.empty()) {
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
+ permutation.emplace_back(i);
}
- dim_names.insert(dim_names.begin(), 1, "");
- } else {
- dim_names = {};
}
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
+ permutation[i] += 1;
Review Comment:
Added.
##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -203,6 +205,48 @@ std::shared_ptr<Array> FixedShapeTensorType::MakeArray(
return std::make_shared<ExtensionArray>(data);
}
+Result<std::shared_ptr<Tensor>> FixedShapeTensorType::MakeTensor(
+ const std::shared_ptr<ExtensionScalar>& scalar) {
+ const auto ext_scalar =
internal::checked_pointer_cast<ExtensionScalar>(scalar);
+ const auto ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(scalar->type);
+ if (!is_fixed_width(*ext_type->value_type())) {
+ return Status::TypeError("Cannot convert non-fixed-width values to
Tensor.");
+ }
+ const auto array =
+ internal::checked_pointer_cast<const
FixedSizeListScalar>(ext_scalar->value)->value;
+ if (array->null_count() > 0) {
+ return Status::Invalid("Cannot convert data with nulls to Tensor.");
+ }
+ const auto value_type =
+ internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type());
+ const auto byte_width = value_type->byte_width();
+
+ std::vector<int64_t> permutation = ext_type->permutation();
+ if (permutation.empty()) {
+ permutation.resize(ext_type->ndim());
+ std::iota(permutation.begin(), permutation.end(), 0);
+ }
+
+ std::vector<int64_t> shape = ext_type->shape();
+ internal::Permute<int64_t>(permutation, &shape);
+
+ std::vector<std::string> dim_names = ext_type->dim_names();
+ if (!dim_names.empty()) {
+ internal::Permute<std::string>(permutation, &dim_names);
+ }
+
+ std::vector<int64_t> strides;
+ ARROW_CHECK_OK(ComputeStrides(*value_type.get(), shape, permutation,
&strides));
Review Comment:
Done.
##########
cpp/src/arrow/extension/fixed_shape_tensor_test.cc:
##########
@@ -308,19 +319,89 @@ TEST_F(TestExtensionType, TestFromTensorType) {
auto dim_names = std::vector<std::vector<std::string>>{
{"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"},
{"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}};
- auto cell_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4, 3},
{3, 4}};
+ auto element_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4,
3}, {3, 4}};
auto permutations = std::vector<std::vector<int64_t>>{{0, 1}, {1, 0}, {0,
1}, {1, 0}};
for (size_t i = 0; i < shapes.size(); i++) {
ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, values,
shapes[i],
strides[i],
tensor_dim_names[i]));
ASSERT_OK_AND_ASSIGN(auto ext_arr,
FixedShapeTensorArray::FromTensor(tensor));
auto ext_type =
- fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i],
dim_names[i]);
+ fixed_shape_tensor(value_type_, element_shapes[i], permutations[i],
dim_names[i]);
CheckFromTensorType(tensor, ext_type);
}
}
+template <typename T>
+void CheckToTensor(const std::vector<T>& values, const
std::shared_ptr<DataType> typ,
+ const int32_t& element_size, const std::vector<int64_t>&
element_shape,
+ const std::vector<int64_t>& element_permutation,
+ const std::vector<std::string>& element_dim_names,
+ const std::vector<int64_t>& tensor_shape,
+ const std::vector<std::string>& tensor_dim_names,
+ const std::vector<int64_t>& tensor_strides) {
+ auto buffer = Buffer::Wrap(values);
+ const std::shared_ptr<DataType> element_type = fixed_size_list(typ,
element_size);
+ std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, buffer};
+ auto arr_data = std::make_shared<ArrayData>(typ, values.size(), buffers);
+ auto arr = std::make_shared<Int64Array>(arr_data);
+ ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr,
element_type));
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_tensor,
+ Tensor::Make(typ, buffer, tensor_shape, tensor_strides,
tensor_dim_names));
+ const auto ext_type =
+ fixed_shape_tensor(typ, element_shape, element_permutation,
element_dim_names);
+
+ auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr);
+ const auto tensor_array =
std::static_pointer_cast<FixedShapeTensorArray>(ext_arr);
+ ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor());
+
+ ASSERT_EQ(actual_tensor->type(), expected_tensor->type());
+ ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape());
+ ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides());
+ ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names());
+ ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data()));
+ ASSERT_TRUE(actual_tensor->Equals(*expected_tensor));
+}
+
+TEST_F(TestExtensionType, ToTensor) {
+ std::vector<float> values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
+
+ auto element_sizes = std::vector<int32_t>{6, 6, 18, 18, 18, 18};
+
+ auto element_shapes = std::vector<std::vector<int64_t>>{{2, 3}, {3, 2},
{3, 6},
+ {6, 3}, {3, 2, 3},
{3, 2, 3}};
+ auto tensor_shapes = std::vector<std::vector<int64_t>>{
+ {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}};
+
+ auto element_permutations = std::vector<std::vector<int64_t>>{
+ {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}};
+ auto tensor_strides_32 =
+ std::vector<std::vector<int64_t>>{{24, 12, 4}, {24, 4, 8}, {72, 24,
4},
+ {72, 4, 12}, {72, 24, 12, 4}, {72, 4,
12, 24}};
+ auto tensor_strides_64 =
+ std::vector<std::vector<int64_t>>{{48, 24, 8}, {48, 8, 16}, {144,
48, 8},
+ {144, 8, 24}, {144, 48, 24, 8}, {144,
8, 24, 48}};
+
+ auto element_dim_names = std::vector<std::vector<std::string>>{
+ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H",
"W", "C"}};
+ auto tensor_dim_names = std::vector<std::vector<std::string>>{
+ {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"},
+ {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}};
+
+ for (size_t i = 0; i < element_shapes.size(); i++) {
+ CheckToTensor<float_t>(values, float32(), element_sizes[i],
element_shapes[i],
+ element_permutations[i], element_dim_names[i],
+ tensor_shapes[i], tensor_dim_names[i],
tensor_strides_32[i]);
+ CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i],
Review Comment:
Removed explicit `float_t` and switched to: `std::vector<float_t>
float_values(values_.begin(), values_.end());`
##########
python/pyarrow/array.pxi:
##########
@@ -3573,17 +3602,19 @@ class FixedShapeTensorArray(ExtensionArray):
]
]
"""
- if not obj.flags["C_CONTIGUOUS"]:
- raise ValueError('The data in the numpy array need to be in a
single, '
- 'C-style contiguous segment.')
+
+ permutation = (-np.array(obj.strides)).argsort(kind='stable')
+ if permutation[0] != 0:
+ raise ValueError('First stride needs to be largest to ensure that '
+ 'individual tensor data is contiguous in memory.')
arrow_type = from_numpy_dtype(obj.dtype)
- shape = obj.shape[1:]
- size = obj.size / obj.shape[0]
+ shape = np.take(obj.shape, permutation)
+ values = np.ravel(obj, order="K")
Review Comment:
As per
https://github.com/apache/arrow/pull/37533#pullrequestreview-1779106422 `ravel`
shouldn't cause copy if memory layout doesn't change. And we're not actively
trying to change the memory layout here.
##########
cpp/src/arrow/extension/fixed_shape_tensor_test.cc:
##########
@@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) {
ASSERT_EQ(expected_3, result_3);
}
+TEST_F(TestExtensionType, GetScalar) {
+ auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {},
dim_names_);
+
+ auto expected_data =
+ ArrayFromJSON(element_type_, "[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23]]");
+ auto storage_array = ArrayFromJSON(element_type_,
+ "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+ "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23]]");
+
+ auto sub_array = ExtensionType::WrapArray(ext_type, expected_data);
+ auto array = ExtensionType::WrapArray(ext_type, storage_array);
+
+ ASSERT_OK_AND_ASSIGN(auto expected_scalar, sub_array->GetScalar(0));
+ ASSERT_OK_AND_ASSIGN(auto actual_scalar, array->GetScalar(1));
+
+ ASSERT_OK(actual_scalar->ValidateFull());
+ ASSERT_TRUE(actual_scalar->type->Equals(*ext_type));
+ ASSERT_TRUE(actual_scalar->is_valid);
+
+ ASSERT_OK(expected_scalar->ValidateFull());
+ ASSERT_TRUE(expected_scalar->type->Equals(*ext_type));
+ ASSERT_TRUE(expected_scalar->is_valid);
+
+ AssertTypeEqual(actual_scalar->type, ext_type);
+ ASSERT_TRUE(actual_scalar->Equals(*expected_scalar));
+}
+
+TEST_F(TestExtensionType, GetTensor) {
+ auto arr = ArrayFromJSON(element_type_,
+ "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+ "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23]]");
+ auto element_values =
+ std::vector<std::vector<int64_t>>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+ {12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23}};
+
+ auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {},
dim_names_);
+ auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0},
{"x", "y"});
+ auto exact_ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
+ auto exact_permuted_ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(permuted_ext_type);
+
+ auto array = std::static_pointer_cast<FixedShapeTensorArray>(
+ ExtensionType::WrapArray(ext_type, arr));
+ auto permuted_array = std::static_pointer_cast<FixedShapeTensorArray>(
+ ExtensionType::WrapArray(permuted_ext_type, arr));
+
+ for (size_t i = 0; i < element_values.size(); i++) {
+ // Get tensor from extension array with trivial permutation
+ ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i));
+ auto actual_ext_scalar =
internal::checked_pointer_cast<ExtensionScalar>(scalar);
+ ASSERT_OK_AND_ASSIGN(auto actual_tensor,
+ exact_ext_type->MakeTensor(actual_ext_scalar));
+ ASSERT_OK_AND_ASSIGN(auto expected_tensor,
+ Tensor::Make(value_type_,
Buffer::Wrap(element_values[i]),
+ {3, 4}, {}, {"x", "y"}));
+ ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape());
+ ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names());
+ ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides());
+ ASSERT_EQ(actual_tensor->strides(), std::vector<int64_t>({32, 8}));
+ ASSERT_EQ(expected_tensor->type(), actual_tensor->type());
+ ASSERT_TRUE(expected_tensor->Equals(*actual_tensor));
+
+ // Get tensor from extension array with non-trivial permutation
+ ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor,
+ Tensor::Make(value_type_,
Buffer::Wrap(element_values[i]),
+ {4, 3}, {8, 24}, {"y", "x"}));
+ ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i));
+ ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor,
+ exact_permuted_ext_type->MakeTensor(
+
internal::checked_pointer_cast<ExtensionScalar>(scalar)));
+ ASSERT_EQ(expected_permuted_tensor->strides(),
actual_permuted_tensor->strides());
+ ASSERT_EQ(expected_permuted_tensor->shape(),
actual_permuted_tensor->shape());
+ ASSERT_EQ(expected_permuted_tensor->dim_names(),
actual_permuted_tensor->dim_names());
+ ASSERT_EQ(expected_permuted_tensor->type(),
actual_permuted_tensor->type());
+ ASSERT_EQ(expected_permuted_tensor->is_contiguous(),
+ actual_permuted_tensor->is_contiguous());
+ ASSERT_EQ(expected_permuted_tensor->is_column_major(),
+ actual_permuted_tensor->is_column_major());
+ ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor));
+ }
+
+ // Test null values fail
+ auto element_type = fixed_size_list(int64(), 1);
+ auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]");
+ ext_type = fixed_shape_tensor(int64(), {1});
+ exact_ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
+ auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr);
+ auto tensor_array = std::static_pointer_cast<FixedShapeTensorArray>(ext_arr);
Review Comment:
`checked_pointer_cast` causes a segfault here. Not sure if it's an issue or
not.
##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>>
FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor
we
// interpret the array's length as the first dimension the new tensor.
- auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
- auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
- ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
- Status::Invalid(ext_arr->value_type()->ToString(),
- " is not valid data type for a tensor"));
- auto permutation = ext_type->permutation();
+ const auto ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
+ const auto value_type = ext_type->value_type();
+ ARROW_RETURN_IF(
+ !is_fixed_width(*value_type),
+ Status::TypeError(value_type->ToString(), " is not valid data type for a
tensor"));
- std::vector<std::string> dim_names;
- if (!ext_type->dim_names().empty()) {
- for (auto i : permutation) {
- dim_names.emplace_back(ext_type->dim_names()[i]);
+ std::vector<int64_t> permutation = ext_type->permutation();
+ if (permutation.empty()) {
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
Review Comment:
Indeed! Changed.
##########
python/pyarrow/array.pxi:
##########
@@ -3573,17 +3602,19 @@ class FixedShapeTensorArray(ExtensionArray):
]
]
"""
- if not obj.flags["C_CONTIGUOUS"]:
- raise ValueError('The data in the numpy array need to be in a
single, '
- 'C-style contiguous segment.')
+
+ permutation = (-np.array(obj.strides)).argsort(kind='stable')
+ if permutation[0] != 0:
+ raise ValueError('First stride needs to be largest to ensure that '
+ 'individual tensor data is contiguous in memory.')
arrow_type = from_numpy_dtype(obj.dtype)
- shape = obj.shape[1:]
- size = obj.size / obj.shape[0]
+ shape = np.take(obj.shape, permutation)
+ values = np.ravel(obj, order="K")
return ExtensionArray.from_storage(
- fixed_shape_tensor(arrow_type, shape),
- FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size)
+ fixed_shape_tensor(arrow_type, shape[1:],
permutation=permutation[1:] - 1),
+ FixedSizeListArray.from_arrays(values, obj[0].size)
Review Comment:
Changed.
##########
cpp/src/arrow/extension/fixed_shape_tensor_test.cc:
##########
@@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) {
ASSERT_EQ(expected_3, result_3);
}
+TEST_F(TestExtensionType, GetScalar) {
Review Comment:
Removed `GetScalar` test.
##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>>
FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor
we
// interpret the array's length as the first dimension the new tensor.
- auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
- auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
- ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
- Status::Invalid(ext_arr->value_type()->ToString(),
- " is not valid data type for a tensor"));
- auto permutation = ext_type->permutation();
+ const auto ext_type =
+ internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
+ const auto value_type = ext_type->value_type();
+ ARROW_RETURN_IF(
+ !is_fixed_width(*value_type),
+ Status::TypeError(value_type->ToString(), " is not valid data type for a
tensor"));
- std::vector<std::string> dim_names;
- if (!ext_type->dim_names().empty()) {
- for (auto i : permutation) {
- dim_names.emplace_back(ext_type->dim_names()[i]);
+ std::vector<int64_t> permutation = ext_type->permutation();
+ if (permutation.empty()) {
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
+ permutation.emplace_back(i);
}
- dim_names.insert(dim_names.begin(), 1, "");
- } else {
- dim_names = {};
}
+ for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
Review Comment:
Changed to a more explicit way.
##########
python/pyarrow/scalar.pxi:
##########
@@ -1027,6 +1027,32 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
+cdef class FixedShapeTensorScalar(ExtensionScalar):
+ """
+ Concrete class for fixed shape tensor extension scalar.
+ """
+
+ def to_numpy_ndarray(self):
Review Comment:
Indeed! Changed for the `Scalar` case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]