pitrou commented on code in PR #37536:
URL: https://github.com/apache/arrow/pull/37536#discussion_r1315085920


##########
cpp/src/arrow/compute/kernels/aggregate_internal.h:
##########
@@ -221,27 +221,34 @@ enable_if_t<std::is_floating_point<SumType>::value, 
SumType> SumArray(
 }
 
 // naive summation for integers and decimals
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel, 
bool Checked,
           typename ValueFunc>
 enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
-    const ArraySpan& data, ValueFunc&& func) {
+    const ArraySpan& data, ValueFunc&& func, Status* status) {
   using arrow::internal::VisitSetBitRunsVoid;
 
   SumType sum = 0;
   const ValueType* values = data.GetValues<ValueType>(1);
-  VisitSetBitRunsVoid(data.buffers[0].data, data.offset, data.length,
-                      [&](int64_t pos, int64_t len) {
-                        for (int64_t i = 0; i < len; ++i) {
-                          sum += func(values[pos + i]);
-                        }
-                      });
+  VisitSetBitRunsVoid(
+      data.buffers[0].data, data.offset, data.length, [&](int64_t pos, int64_t 
len) {
+        for (int64_t i = 0; i < len; ++i) {
+          if constexpr (Checked) {
+            sum = AddChecked::Call<SumType>(nullptr, sum, func(values[pos + 
i]), status);
+            if (ARROW_PREDICT_FALSE(!status->ok())) {

Review Comment:
   Returning here seems a bit pointless as the next bit runs will still be read 
anyway.
   
   Instead, perhaps you should move the `Checked` condition at the function 
toplevel and use `VisitSetBitRuns` (non-void) if checked summation is requested.



##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -176,11 +202,11 @@ struct MeanImpl;
 
 template <typename ArrowType, SimdLevel::type SimdLevel>
 struct MeanImpl<ArrowType, SimdLevel, enable_if_decimal<ArrowType>>
-    : public SumImpl<ArrowType, SimdLevel> {
-  using SumImpl<ArrowType, SimdLevel>::SumImpl;
-  using SumImpl<ArrowType, SimdLevel>::options;
-  using SumCType = typename SumImpl<ArrowType, SimdLevel>::SumCType;
-  using OutputType = typename SumImpl<ArrowType, SimdLevel>::OutputType;
+    : public SumImpl<ArrowType, SimdLevel, false> {

Review Comment:
   Suggesting, for here and other similar places: make it clear what the 
boolean parameter selects
   ```suggestion
       : public SumImpl<ArrowType, SimdLevel, /*Checked=*/ false> {
   ```



##########
cpp/src/arrow/compute/kernels/aggregate_var_std.cc:
##########
@@ -60,16 +60,19 @@ struct VarStdState {
     if (count == 0 || (!this->all_valid && !options.skip_nulls)) {
       return;
     }
-
+    Status status;
     using SumType = typename internal::GetSumType<T>::SumType;
-    SumType sum = internal::SumArray<CType, SumType, SimdLevel::NONE>(array);
+    SumType sum =
+        internal::SumArray<CType, SumType, SimdLevel::NONE, false>(array, 
&status);
 
     const double mean = ToDouble(sum) / count;
-    const double m2 = internal::SumArray<CType, double, SimdLevel::NONE>(
-        array, [this, mean](CType value) {
+    const double m2 = internal::SumArray<CType, double, SimdLevel::NONE, 
false>(
+        array,
+        [this, mean](CType value) {
           const double v = ToDouble(value);
           return (v - mean) * (v - mean);
-        });
+        },
+        &status);

Review Comment:
   Let's make sure we don't let an error slip here? (it shouldn't happen now 
but perhaps it will at some point?)
   ```suggestion
           &status);
       ARROW_CHECK_OK(status);
   ```



##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -49,20 +50,22 @@ void AddMinMaxKernel(KernelInit init, 
internal::detail::GetTypeId get_id,
 
 // SIMD variants for kernels
 void AddSumAvx2AggKernels(ScalarAggregateFunction* func);
+void AddSumCheckedAvx2AggKernels(ScalarAggregateFunction* func);
 void AddMeanAvx2AggKernels(ScalarAggregateFunction* func);
 void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func);
 
 void AddSumAvx512AggKernels(ScalarAggregateFunction* func);
+void AddSumCheckedAvx512AggKernels(ScalarAggregateFunction* func);
 void AddMeanAvx512AggKernels(ScalarAggregateFunction* func);
 void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func);
 
 // ----------------------------------------------------------------------
 // Sum implementation
 
-template <typename ArrowType, SimdLevel::type SimdLevel,
+template <typename ArrowType, SimdLevel::type SimdLevel, bool Checked,

Review Comment:
   TBH, I'm not sure `Checked` and `SimdLevel` need to be template parameters 
at this level. `SumArray` is performance critical, but the orchestration in 
`SumImpl` probably much less so.
   
   Not making those template parameters may save a bit on code generation and 
compilation times. Perhaps open an issue about that?



##########
cpp/src/arrow/compute/api_aggregate.h:
##########
@@ -284,6 +284,21 @@ Result<Datum> Sum(
     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
     ExecContext* ctx = NULLPTR);
 
+/// \brief Sum values of a numeric array. Return error if overflow occurs
+///
+/// \param[in] value datum to sum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed sum as a Scalar
+///
+/// \since 1.0.0

Review Comment:
   We should not blindly copy this from another docstring :-) The next version 
to be released is 14.0.0.
   



##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -592,25 +652,51 @@ TEST(TestNullSumKernel, Basics) {
   Datum null_result = std::make_shared<Int64Scalar>();
   Datum zero_result = std::make_shared<Int64Scalar>(0);
 
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"})),
-              ResultWith(null_result));
+  for (auto func : {Sum, SumChecked}) {
+    SCOPED_TRACE(func);
+    auto default_options = ScalarAggregateOptions::Defaults();
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}),
+                     default_options, nullptr),
+                ResultWith(null_result));
 
-  ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options), 
ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options), 
ResultWith(zero_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}), 
options),
-              ResultWith(zero_result));
+    ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+                ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr), 
ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+                ResultWith(zero_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}), options,
+                     nullptr),
+                ResultWith(zero_result));
+
+    options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr), 
ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}), options,
+                     nullptr),
+                ResultWith(null_result));
+  }
+}
 
-  options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options), 
ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options), 
ResultWith(null_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}), 
options),
-              ResultWith(null_result));
+TEST(TestSumKernel, Overflow) {
+  int64_t large_scalar = 1000000000000000;
+  int64_t length = 10000;
+  auto scalar = std::make_shared<Int64Scalar>(large_scalar);
+  auto array = MakeArrayFromScalar(*scalar, length);
+
+#ifndef ARROW_UBSAN

Review Comment:
   I think you can remove this condition if you use a `uint64` instead, as 
unsigned overflow doesn't trigger UB.



##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -360,6 +380,10 @@ TYPED_TEST(TestNumericSumKernel, SimpleSum) {
               
ResultWith(Datum(std::make_shared<ScalarType>(static_cast<T>(5)))));
   EXPECT_THAT(Sum(MakeNullScalar(TypeTraits<TypeParam>::type_singleton())),
               
ResultWith(Datum(MakeNullScalar(TypeTraits<SumType>::type_singleton()))));
+  
EXPECT_THAT(SumChecked(Datum(std::make_shared<InputScalarType>(static_cast<T>(5)))),
+              
ResultWith(Datum(std::make_shared<ScalarType>(static_cast<T>(5)))));
+  
EXPECT_THAT(SumChecked(MakeNullScalar(TypeTraits<TypeParam>::type_singleton())),
+              
ResultWith(Datum(MakeNullScalar(TypeTraits<SumType>::type_singleton()))));

Review Comment:
   Not sure that would work, but instead of repeating these lines, perhaps you 
can iterate over the sum functions:
   ```c++
     for (auto&& sum_func : {Sum, SumChecked}) ...
   ```
   
   



##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -592,25 +652,51 @@ TEST(TestNullSumKernel, Basics) {
   Datum null_result = std::make_shared<Int64Scalar>();
   Datum zero_result = std::make_shared<Int64Scalar>(0);
 
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]")), ResultWith(null_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"})),
-              ResultWith(null_result));
+  for (auto func : {Sum, SumChecked}) {
+    SCOPED_TRACE(func);
+    auto default_options = ScalarAggregateOptions::Defaults();
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), default_options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}),
+                     default_options, nullptr),
+                ResultWith(null_result));
 
-  ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options), 
ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options), 
ResultWith(zero_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}), 
options),
-              ResultWith(zero_result));
+    ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+                ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr), 
ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+                ResultWith(zero_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}), options,
+                     nullptr),
+                ResultWith(zero_result));
+
+    options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
+    EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr), 
ResultWith(zero_result));
+    EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+                ResultWith(null_result));
+    EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, 
null]"}), options,
+                     nullptr),
+                ResultWith(null_result));
+  }
+}
 
-  options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
-  EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options), 
ResultWith(null_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
-  EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options), 
ResultWith(null_result));
-  EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}), 
options),
-              ResultWith(null_result));
+TEST(TestSumKernel, Overflow) {
+  int64_t large_scalar = 1000000000000000;
+  int64_t length = 10000;
+  auto scalar = std::make_shared<Int64Scalar>(large_scalar);

Review Comment:
   Can we test overflow using more types (including decimals)? It should be 
relatively simple even without templating if you use `ScalarFromJSON`.
   
   Also, this is testing overflow when summing a broadcast scalar, but we 
should also test overflow within an array.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to