This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3a9d9bc716 GH-49410: [C++] Fix if_else null-scalar fast paths for
sliced BaseBinary arrays (#49443)
3a9d9bc716 is described below
commit 3a9d9bc716afce97a4b7c802e2f5892cdd3bda72
Author: Ebraam <[email protected]>
AuthorDate: Thu Mar 12 13:41:48 2026 +0200
GH-49410: [C++] Fix if_else null-scalar fast paths for sliced BaseBinary
arrays (#49443)
### Rationale for this change
`if_else` with a null scalar and a sliced `BaseBinary` array (`offsets[0]
!= 0`) produces invalid output. The ASA and AAS shortcut paths in
`scalar_if_else.cc` copy offsets without adjusting for the slice offset, and
copy data from byte 0 instead of `data + offsets[0]`.
### What changes are included in this PR?
**Fix** (`scalar_if_else.cc`): In both the ASA and AAS fast paths:
- Replace the single `memcpy` for the offsets buffer with a loop that
normalizes each offset by subtracting `offsets[0]` as a base, so output offsets
always start at 0.
- Change the data copy to start from `data + base` instead of `data`, so
only the bytes belonging to the slice are copied.
**Regression tests (`scalar_if_else_test.cc`):**
- `IfElseBaseBinarySliced`: covers sliced arrays (`array.offset != 0`) for
both ASA and AAS paths.
- `IfElseBaseBinaryNonZeroFirst`: covers the edge case where `array.offset
== 0` but `offsets[0] != 0`, manually constructed via `ArrayData::Make()`.
### Are these changes tested?
Yes. The new `IfElseBaseBinarySliced` typed test reproduces the bug and
passes after the fix. All 416 existing tests continue to pass.
### Are there any user-facing changes?
Yes. The bug causes incorrect/invalid data to be produced when `if_else` is
called with a null scalar and a sliced `BaseBinary` array.
* GitHub Issue: #49410
Lead-authored-by: Ebraam <[email protected]>
Co-authored-by: Ebraam <[email protected]>
Co-authored-by: Ibraam-Ashraf
<[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/compute/kernels/scalar_if_else.cc | 30 ++++++++++--
.../arrow/compute/kernels/scalar_if_else_test.cc | 53 ++++++++++++++++++++++
2 files changed, 79 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index ed32ec203c..0c5cfe1c90 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -759,11 +759,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
auto* out_data = out->array_data().get();
auto offset_length = (cond.length + 1) * sizeof(OffsetType);
ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
ctx->Allocate(offset_length));
- std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets,
offset_length);
+
+ if (right_offsets[0] == 0) {
+ std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets,
offset_length);
+ } else {
+ OffsetType base = right_offsets[0];
+ auto* out_offsets =
+
reinterpret_cast<OffsetType*>(out_data->buffers[1]->mutable_data());
+ for (int64_t i = 0; i <= cond.length; ++i) {
+ out_offsets[i] = right_offsets[i] - base;
+ }
+ }
auto right_data_length = right_offsets[right.length] - right_offsets[0];
ARROW_ASSIGN_OR_RAISE(out_data->buffers[2],
ctx->Allocate(right_data_length));
- std::memcpy(out_data->buffers[2]->mutable_data(), right_data,
right_data_length);
+ std::memcpy(out_data->buffers[2]->mutable_data(), right_data +
right_offsets[0],
+ right_data_length);
return Status::OK();
}
@@ -801,11 +812,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
auto* out_data = out->array_data().get();
auto offset_length = (cond.length + 1) * sizeof(OffsetType);
ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
ctx->Allocate(offset_length));
- std::memcpy(out_data->buffers[1]->mutable_data(), left_offsets,
offset_length);
+
+ if (left_offsets[0] == 0) {
+ std::memcpy(out_data->buffers[1]->mutable_data(), left_offsets,
offset_length);
+ } else {
+ OffsetType base = left_offsets[0];
+ auto* out_offsets =
+
reinterpret_cast<OffsetType*>(out_data->buffers[1]->mutable_data());
+ for (int64_t i = 0; i <= cond.length; ++i) {
+ out_offsets[i] = left_offsets[i] - base;
+ }
+ }
auto left_data_length = left_offsets[left.length] - left_offsets[0];
ARROW_ASSIGN_OR_RAISE(out_data->buffers[2],
ctx->Allocate(left_data_length));
- std::memcpy(out_data->buffers[2]->mutable_data(), left_data,
left_data_length);
+ std::memcpy(out_data->buffers[2]->mutable_data(), left_data +
left_offsets[0],
+ left_data_length);
return Status::OK();
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index e5cf73742b..6fdcff8d97 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -27,6 +27,7 @@
#include "arrow/compute/kernels/test_util_internal.h"
#include "arrow/compute/registry.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bitmap_builders.h"
#include "arrow/util/checked_cast.h"
namespace arrow {
@@ -609,6 +610,58 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinarySliced) {
+ auto type = TypeTraits<TypeParam>::type_singleton();
+
+ auto full_arr = ArrayFromJSON(type, R"(["not used", null, "x", "x"])");
+ auto sliced = full_arr->Slice(1);
+ auto expected = ArrayFromJSON(type, R"([null, "x", "x"])");
+
+ auto cond_asa = ArrayFromJSON(boolean(), "[true, false, false]");
+ ASSERT_OK_AND_ASSIGN(auto result_asa,
+ CallFunction("if_else", {cond_asa,
MakeNullScalar(type), sliced}));
+ ASSERT_OK(result_asa.make_array()->ValidateFull());
+ AssertArraysEqual(*expected, *result_asa.make_array(), true);
+
+ auto cond_aas = ArrayFromJSON(boolean(), "[false, true, true]");
+ ASSERT_OK_AND_ASSIGN(auto result_aas,
+ CallFunction("if_else", {cond_aas, sliced,
MakeNullScalar(type)}));
+ ASSERT_OK(result_aas.make_array()->ValidateFull());
+ AssertArraysEqual(*expected, *result_aas.make_array(), true);
+}
+
+// array offset=0 but offsets[0] != 0
+TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryNonZeroFirst) {
+ auto type = TypeTraits<TypeParam>::type_singleton();
+ using OffsetType = typename TypeTraits<TypeParam>::OffsetType::c_type;
+
+ std::vector<OffsetType> raw_offsets = {8, 8, 9, 10};
+ std::string raw_data(8, 'p');
+ raw_data += "ab";
+ auto offsets_buf = Buffer::Wrap(raw_offsets.data(), raw_offsets.size());
+ auto data_buf = Buffer::Wrap(raw_data.data(), raw_data.size());
+ auto array_data = ArrayData::Make(type, /*length=*/3, {nullptr, offsets_buf,
data_buf},
+ /*null_count=*/1, /*offset=*/0);
+ std::vector<uint8_t> validity_bytes = {0, 1, 1};
+ ASSERT_OK_AND_ASSIGN(array_data->buffers[0],
+ internal::BytesToBits(validity_bytes,
default_memory_pool()));
+ auto arr = MakeArray(array_data);
+ ASSERT_OK(arr->ValidateFull());
+ auto expected = ArrayFromJSON(type, R"([null, "a", "b"])");
+
+ auto cond_asa = ArrayFromJSON(boolean(), "[true, false, false]");
+ ASSERT_OK_AND_ASSIGN(auto result_asa,
+ CallFunction("if_else", {cond_asa,
MakeNullScalar(type), arr}));
+ ASSERT_OK(result_asa.make_array()->ValidateFull());
+ AssertArraysEqual(*expected, *result_asa.make_array(), true);
+
+ auto cond_aas = ArrayFromJSON(boolean(), "[false, true, true]");
+ ASSERT_OK_AND_ASSIGN(auto result_aas,
+ CallFunction("if_else", {cond_aas, arr,
MakeNullScalar(type)}));
+ ASSERT_OK(result_aas.make_array()->ValidateFull());
+ AssertArraysEqual(*expected, *result_aas.make_array(), true);
+}
+
Result<std::shared_ptr<Array>> MakeBinaryArrayWithData(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>&
data_buffer) {
// Make a (large-)binary array with a single item backed by the given data