rok commented on code in PR #47586:
URL: https://github.com/apache/arrow/pull/47586#discussion_r2368236968
##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -63,5 +66,55 @@ Result<std::shared_ptr<Tensor>>
MakeTensorFromSparseCSCMatrix(
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
+template <typename Convertor>
Review Comment:
```suggestion
template <typename Converter>
```
Appears multiple times in this file.
##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -63,5 +66,55 @@ Result<std::shared_ptr<Tensor>>
MakeTensorFromSparseCSCMatrix(
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
+template <typename Convertor>
+struct ConverterVisitor {
+ explicit ConverterVisitor(Convertor& converter) : converter(converter) {}
+ template <typename ValueType, typename IndexType>
+ Status operator()(const ValueType& value, const IndexType& index_type) {
+ return converter.Convert(value, index_type);
+ }
+
+ Convertor& converter;
+};
+
+struct ValueTypeVisitor {
+ template <typename ValueType, typename IndexType, typename Function>
+ enable_if_number<ValueType, Status> Visit(const ValueType& value_type,
+ const IndexType& index_type,
+ Function&& function) {
+ return function(value_type, index_type);
+ }
+
+ template <typename IndexType, typename Function>
+ Status Visit(const DataType& value_type, const IndexType&, Function&&) {
+ return Status::Invalid("Invalid value type and the type is ",
value_type.name());
+ }
+};
Review Comment:
```suggestion
template <typename IndexType, typename Function>
Status Visit(const DataType& value_type, const IndexType&, Function&&) {
    return Status::Invalid("Invalid value type: ", value_type.name(),
                           ". Expected a number.");
}
};
```
Do we actually need this? (I've not used VisitTypeInline before, so this may
be a stupid question.)
##########
cpp/src/arrow/tensor.h:
##########
@@ -55,6 +55,13 @@ constexpr bool is_tensor_supported(Type::type type_id) {
namespace internal {
+// TODO(GH-47578): Enable HalfFloatType
+template <typename ValueDataType>
+inline bool is_not_zero(typename ValueDataType::c_type value) {
+ typename ValueDataType::c_type zero = 0;
+ return value != zero;
+}
+
Review Comment:
Do we really need this as a separate function?
##########
cpp/src/arrow/tensor/coo_converter.cc:
##########
@@ -180,34 +181,29 @@ class SparseCOOTensorConverter : private
SparseTensorConverterMixin {
ARROW_ASSIGN_OR_RAISE(auto indices_buffer,
AllocateBuffer(index_elsize * ndim * nonzero_count,
pool_));
- uint8_t* indices = indices_buffer->mutable_data();
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
- uint8_t* values = values_buffer->mutable_data();
- const uint8_t* tensor_data = tensor_.raw_data();
+ auto* values = values_buffer->mutable_data_as<ValueCType>();
+ const auto* tensor_data = tensor_.data()->data_as<ValueCType>();
+ auto* indices = indices_buffer->mutable_data_as<IndexCType>();
if (ndim <= 1) {
const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
for (int64_t i = 0; i < count; ++i) {
- if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) {
- AssignIndex(indices, i, index_elsize);
- std::copy_n(tensor_data, value_elsize, values);
-
- indices += index_elsize;
- values += value_elsize;
+ if (is_not_zero<ValueType>(*tensor_data)) {
+ *indices++ = static_cast<IndexCType>(i);
+ *values++ = *tensor_data;
}
- tensor_data += value_elsize;
+ ++tensor_data;
Review Comment:
This seems less error-prone.
##########
cpp/src/arrow/tensor/coo_converter.cc:
##########
@@ -180,34 +181,29 @@ class SparseCOOTensorConverter : private
SparseTensorConverterMixin {
ARROW_ASSIGN_OR_RAISE(auto indices_buffer,
AllocateBuffer(index_elsize * ndim * nonzero_count,
pool_));
- uint8_t* indices = indices_buffer->mutable_data();
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
- uint8_t* values = values_buffer->mutable_data();
- const uint8_t* tensor_data = tensor_.raw_data();
+ auto* values = values_buffer->mutable_data_as<ValueCType>();
+ const auto* tensor_data = tensor_.data()->data_as<ValueCType>();
+ auto* indices = indices_buffer->mutable_data_as<IndexCType>();
if (ndim <= 1) {
const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
for (int64_t i = 0; i < count; ++i) {
- if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) {
- AssignIndex(indices, i, index_elsize);
- std::copy_n(tensor_data, value_elsize, values);
-
- indices += index_elsize;
- values += value_elsize;
+ if (is_not_zero<ValueType>(*tensor_data)) {
+ *indices++ = static_cast<IndexCType>(i);
+ *values++ = *tensor_data;
}
- tensor_data += value_elsize;
+ ++tensor_data;
}
} else if (tensor_.is_row_major()) {
- DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices,
values,
- nonzero_count);
+ ConvertRowMajorTensor<IndexType, ValueType>(tensor_, indices, values);
} else if (tensor_.is_column_major()) {
- DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize,
indices, values,
- nonzero_count);
+ ConvertColumnMajorTensor<IndexType, ValueType>(tensor_, indices, values,
+ nonzero_count);
} else {
- DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices,
values,
- nonzero_count);
+ ConvertStridedTensor<IndexType, ValueType>(tensor_, indices, values);
Review Comment:
Switching from DISPATCH to templates seems ok to me, but perhaps @pitrou can
double-check?
##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -63,5 +66,55 @@ Result<std::shared_ptr<Tensor>>
MakeTensorFromSparseCSCMatrix(
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
+template <typename Convertor>
+struct ConverterVisitor {
+ explicit ConverterVisitor(Convertor& converter) : converter(converter) {}
+ template <typename ValueType, typename IndexType>
+ Status operator()(const ValueType& value, const IndexType& index_type) {
+ return converter.Convert(value, index_type);
+ }
+
+ Convertor& converter;
+};
+
+struct ValueTypeVisitor {
+ template <typename ValueType, typename IndexType, typename Function>
+ enable_if_number<ValueType, Status> Visit(const ValueType& value_type,
+ const IndexType& index_type,
+ Function&& function) {
+ return function(value_type, index_type);
+ }
+
+ template <typename IndexType, typename Function>
+ Status Visit(const DataType& value_type, const IndexType&, Function&&) {
+ return Status::Invalid("Invalid value type and the type is ",
value_type.name());
+ }
+};
+
+struct IndexAndValueTypeVisitor {
+ template <typename IndexType, typename Function>
+ enable_if_integer<IndexType, Status> Visit(const IndexType& index_type,
+ const std::shared_ptr<DataType>&
value_type,
+ Function&& function) {
+ ValueTypeVisitor visitor;
+ return VisitTypeInline(*value_type, &visitor, index_type,
+ std::forward<Function>(function));
+ }
+
+ template <typename Function>
+ Status Visit(const DataType& type, const std::shared_ptr<DataType>&,
Function&&) {
+ return Status::Invalid("Invalid index type and the type is ", type.name());
+ }
Review Comment:
```suggestion
template <typename Function>
Status Visit(const DataType& type, const std::shared_ptr<DataType>&,
Function&&) {
    return Status::Invalid("Invalid index type: ", type.name(),
                           ". Expected integer.");
}
```
Same question as above - do we need this?
##########
cpp/src/arrow/tensor/coo_converter.cc:
##########
@@ -25,46 +25,48 @@
#include "arrow/buffer.h"
#include "arrow/status.h"
+#include "arrow/tensor.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
-#include "arrow/visit_type_inline.h"
namespace arrow {
class MemoryPool;
namespace internal {
+
namespace {
-template <typename c_index_type>
-inline void IncrementRowMajorIndex(std::vector<c_index_type>& coord,
+template <typename IndexCType>
+inline void IncrementRowMajorIndex(std::vector<IndexCType>& coord,
const std::vector<int64_t>& shape) {
const int64_t ndim = shape.size();
++coord[ndim - 1];
- if (coord[ndim - 1] == shape[ndim - 1]) {
+ if (static_cast<int64_t>(coord[ndim - 1]) == shape[ndim - 1]) {
int64_t d = ndim - 1;
- while (d > 0 && coord[d] == shape[d]) {
+ while (d > 0 && static_cast<int64_t>(coord[d]) == shape[d]) {
coord[d] = 0;
++coord[d - 1];
--d;
}
}
}
-template <typename c_index_type, typename c_value_type>
-void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices,
- c_value_type* values, const int64_t size) {
+template <typename IndexType, typename ValueType>
+void ConvertContinuousTensor(const Tensor& tensor, typename IndexType::c_type*
indices,
+ typename ValueType::c_type* values) {
+ using ValueCType = typename ValueType::c_type;
+ using IndexCType = typename IndexType::c_type;
Review Comment:
Is this related to the constructor being row/column-major agnostic? Would it
be a lot of effort to move these changes to a separate PR?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]