This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 5f12de2ffb GH-49310: [C++][Compute] Fix segmentation fault in 
pyarrow.compute.if_else (#49375)
5f12de2ffb is described below

commit 5f12de2ffb4ad7a24c2b9dff0d6aac3f84ef3989
Author: PaweÅ‚ Biegun <[email protected]>
AuthorDate: Tue Mar 10 01:05:44 2026 -0700

    GH-49310: [C++][Compute] Fix segmentation fault in pyarrow.compute.if_else 
(#49375)
    
    ### Rationale for this change
    Fixing #49310
    
    ### What changes are included in this PR?
    I added a check for the if else compute kernels that verify that the result 
will fit in the offset of the result. The check asserts that no matter which 
values are chosen from either the left or right arrays/scalars there will be no 
overflow in order to use the UnsafeAppend which skips almost all safety checks.
    
    ### Are these changes tested?
    Yes, There are tests for each of the kernels that include an array that the 
values that would overflow the 32 bit index trigger the check. These tests 
don't actually allocate the data to save on time and compute. There are also 
two tests that verify an overflow for a 32 bit offset type and trigger an error 
and the second one that ensures that for a data type with a 64 bit offset there 
is no overflow error.
    
    ### Are there any user-facing changes?
    Yes, if the result may not fit in a type that relies on 32 bit offsets the 
user will receive a helpful error message instead of a segfault.
    * GitHub Issue: #49310
    
    Lead-authored-by: PaweÅ‚ Biegun <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc    | 24 +++++-
 .../arrow/compute/kernels/scalar_if_else_test.cc   | 85 ++++++++++++++++++++++
 2 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc 
b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index d32070f5c0..ed32ec203c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <cstring>
+#include <limits>
 #include "arrow/array/builder_nested.h"
 #include "arrow/array/builder_primitive.h"
 #include "arrow/array/builder_time.h"
@@ -688,6 +689,17 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using BuilderType = typename TypeTraits<Type>::BuilderType;
 
+  static Status ValidateCapacityForOffsetType(int64_t data_bytes) {
+    int64_t max_offset = 
static_cast<int64_t>(std::numeric_limits<OffsetType>::max());
+    if (data_bytes > max_offset) {
+      return Status::CapacityError("Result may exceed offset capacity for this 
type: ",
+                                   data_bytes, " > ", max_offset,
+                                   ". Convert inputs to a type that uses an 
int64 based "
+                                   "offset such as a large_string");
+    }
+    return Status::OK();
+  }
+
   // A - Array, S - Scalar, X = Array/Scalar
 
   // SXX
@@ -712,9 +724,13 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
     const uint8_t* right_data = right.buffers[2].data;
 
     // allocate data buffer conservatively
-    int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
-                              right_offsets[right.length] - right_offsets[0];
+    int64_t data_buff_alloc = (static_cast<int64_t>(left_offsets[left.length]) 
-
+                               static_cast<int64_t>(left_offsets[0])) +
+                              
(static_cast<int64_t>(right_offsets[right.length]) -
+                               static_cast<int64_t>(right_offsets[0]));
 
+    // output a nicer error message if the heuristic overflows max capacity
+    ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
     BuilderType builder(ctx->memory_pool());
     ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
     ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
@@ -758,6 +774,8 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
     int64_t data_buff_alloc =
         left_size * cond.length + right_offsets[right.length] - 
right_offsets[0];
 
+    // output a nicer error message if the heuristic overflows max capacity
+    ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
     BuilderType builder(ctx->memory_pool());
     ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
     ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
@@ -798,6 +816,8 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
     int64_t data_buff_alloc =
         right_size * cond.length + left_offsets[left.length] - left_offsets[0];
 
+    // output a nicer error message if the heuristic overflows max capacity
+    ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
     BuilderType builder(ctx->memory_pool());
     ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
     ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index 85d152aa8c..e5cf73742b 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <limits>
 #include <numeric>
 
 #include <gtest/gtest.h>
@@ -608,6 +609,90 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
   CheckIfElseOutput(cond, left, right, expected_data);
 }
 
+Result<std::shared_ptr<Array>> MakeBinaryArrayWithData(
+    const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& 
data_buffer) {
+  // Make a (large-)binary array with a single item backed by the given data
+  const auto data_length = data_buffer->size();
+  if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+    if (data_length > std::numeric_limits<int32_t>::max()) {
+      return Status::Invalid("data_length exceeds int32 offset range");
+    }
+    auto offsets =
+        Buffer::FromVector(std::vector<int32_t>{0, 
static_cast<int32_t>(data_length)});
+    return MakeArray(
+        ArrayData::Make(type, 1, {nullptr, std::move(offsets), 
std::move(data_buffer)}));
+  }
+  if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+    return Status::TypeError("unsupported var-binary type for helper: ", 
*type);
+  }
+  auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+  return MakeArray(
+      ArrayData::Make(type, 1, {nullptr, std::move(offsets), 
std::move(data_buffer)}));
+}
+
+void CheckIfElseCapacityBehavior(const Datum& cond, const Datum& left, const 
Datum& right,
+                                 bool expect_capacity_error) {
+  auto maybe_out = CallFunction("if_else", {cond, left, right});
+  if (expect_capacity_error) {
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        CapacityError,
+        ::testing::HasSubstr("Result may exceed offset capacity for this 
type"),
+        maybe_out);
+  } else {
+    ASSERT_OK(maybe_out);
+  }
+}
+
+TEST_F(TestIfElseKernel, LARGE_MEMORY_TEST(IfElseBinaryAAANear2GB)) {
+  // See GH-49310.
+  // `kPerSide` is below the capacity limit for a single binary array,
+  // but trying to allocate twice `kPerSide` would overflow the capacity limit.
+  constexpr int64_t kPerSide =
+      static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+  auto cond = ArrayFromJSON(boolean(), "[true]");
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> large_buffer, 
AllocateBuffer(kPerSide));
+
+  ASSERT_OK_AND_ASSIGN(auto binary_left, MakeBinaryArrayWithData(binary(), 
large_buffer));
+  ASSERT_OK_AND_ASSIGN(auto binary_right,
+                       MakeBinaryArrayWithData(binary(), large_buffer));
+  // The if_else heuristic for preallocation is too crude and fails with
+  // a capacity error as it would like to pre-allocate more than 2GiB
+  // of binary data.
+  CheckIfElseCapacityBehavior(cond, binary_left, binary_right,
+                              /*expect_capacity_error=*/true);
+  ASSERT_OK_AND_ASSIGN(auto large_binary_left,
+                       MakeBinaryArrayWithData(large_binary(), large_buffer));
+  ASSERT_OK_AND_ASSIGN(auto large_binary_right,
+                       MakeBinaryArrayWithData(large_binary(), large_buffer));
+  CheckIfElseCapacityBehavior(cond, large_binary_left, large_binary_right,
+                              /*expect_capacity_error=*/false);
+}
+
+TEST_F(TestIfElseKernel, LARGE_MEMORY_TEST(IfElseBinaryASANear2GB)) {
+  constexpr int64_t kPerSide =
+      static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+  auto cond = ArrayFromJSON(boolean(), "[true]");
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> large_buffer, 
AllocateBuffer(kPerSide));
+  ASSERT_OK_AND_ASSIGN(auto binary_array,
+                       MakeBinaryArrayWithData(binary(), large_buffer));
+  ASSERT_OK_AND_ASSIGN(auto binary_scalar, MakeScalar(binary(), large_buffer));
+  CheckIfElseCapacityBehavior(cond, binary_scalar, binary_array,
+                              /*expect_capacity_error=*/true);
+  CheckIfElseCapacityBehavior(cond, binary_array, binary_scalar,
+                              /*expect_capacity_error=*/true);
+
+  ASSERT_OK_AND_ASSIGN(auto large_binary_array,
+                       MakeBinaryArrayWithData(large_binary(), large_buffer));
+  ASSERT_OK_AND_ASSIGN(auto large_binary_scalar,
+                       MakeScalar(large_binary(), large_buffer));
+  CheckIfElseCapacityBehavior(cond, large_binary_scalar, large_binary_array,
+                              /*expect_capacity_error=*/false);
+  CheckIfElseCapacityBehavior(cond, large_binary_array, large_binary_scalar,
+                              /*expect_capacity_error=*/false);
+}
+
 TEST_F(TestIfElseKernel, IfElseFSBinary) {
   auto type = fixed_size_binary(4);
 

Reply via email to