This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f8184d57fb GH-50105: [C++][Python] Fix sliced sparse union null checks
(#50108)
f8184d57fb is described below
commit f8184d57fbc03f39f7961a503db65b58eac46fd1
Author: fenfeng9 <[email protected]>
AuthorDate: Wed Jun 10 22:49:39 2026 +0800
GH-50105: [C++][Python] Fix sliced sparse union null checks (#50108)
### Rationale for this change
Sliced sparse unions could report incorrect nullness during element access.
The type id lookup used the union offset, but the child null check still
used the slice-relative index.
### What changes are included in this PR?
* Update `IsNullSparseUnion` and `ArraySpan::IsNullSparseUnion` in
`data.cc` to add the parent offset to the index when checking child nullness.
* Add C++ and Python regression tests.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #50105
Authored-by: fenfeng9 <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/array/array_union_test.cc | 45 +++++++++++++++++++++++++++++++++
cpp/src/arrow/array/data.cc | 4 +--
python/pyarrow/tests/test_array.py | 35 +++++++++++++++++++++++++
3 files changed, 82 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc
index 77ba247779..ec93329d08 100644
--- a/cpp/src/arrow/array/array_union_test.cc
+++ b/cpp/src/arrow/array/array_union_test.cc
@@ -134,6 +134,51 @@ TEST(TestSparseUnionArray, GetFlattenedField) {
}
}
+TEST(TestSparseUnionArray, SliceIsNull) {
+ // GH-50105: Sliced sparse union element access can return incorrect values.
+ auto type_ids = ArrayFromJSON(int8(), "[0, 1, 0, 1, 0]");
+ auto ints = ArrayFromJSON(int64(), "[null, 20, 30, 40, 50]");
+ auto strs = ArrayFromJSON(utf8(), R"(["a", "b", "c", null, "e"])");
+
+ // arr == [null, "b", 30, null, 50].
+ ASSERT_OK_AND_ASSIGN(const auto arr, SparseUnionArray::Make(*type_ids,
{ints, strs}));
+
+ // Check that IsNull() returns the expected logical nullness for sliced
entries.
+ auto check_slice = [](const std::shared_ptr<SparseUnionArray>& sliced,
+ const std::initializer_list<bool> expected_is_null) {
+ ASSERT_EQ(sliced->length(), static_cast<int64_t>(expected_is_null.size()));
+
+ ArraySpan span(*sliced->data());
+ int64_t i = 0;
+ for (bool expected : expected_is_null) {
+ ASSERT_EQ(expected, sliced->IsNull(i));
+ ASSERT_EQ(expected, span.IsNull(i));
+ ++i;
+ }
+ };
+
+ // slice(1, 4) == ["b", 30, null, 50].
+ auto sliced_1_4 = checked_pointer_cast<SparseUnionArray>(arr->Slice(1, 4));
+ check_slice(sliced_1_4, {false, false, true, false});
+
+ // Check that the parent and child offsets compose correctly.
+ auto ints_with_offset =
+ ArrayFromJSON(int64(), "[999, null, 20, 30, 40, 50]")->Slice(1, 5);
+ auto strs_with_offset =
+ ArrayFromJSON(utf8(), R"(["z", "a", "b", "c", null, "e"])")->Slice(1, 5);
+
+ // arr_with_sliced_children == [null, "b", 30, null, 50].
+ ASSERT_OK_AND_ASSIGN(
+ const auto arr_with_sliced_children,
+ SparseUnionArray::Make(*type_ids, {ints_with_offset, strs_with_offset}));
+
+ // arr_with_sliced_children.slice(1, 4) == ["b", 30, null, 50].
+ auto sliced_children_1_4 =
+
checked_pointer_cast<SparseUnionArray>(arr_with_sliced_children->Slice(1, 4));
+
+ check_slice(sliced_children_1_4, {false, false, true, false});
+}
+
TEST(TestSparseUnionArray, Validate) {
auto a = ArrayFromJSON(int32(), "[4, 5]");
auto type = sparse_union({field("a", int32())});
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index cb7959174a..dcc659e709 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -73,7 +73,7 @@ bool IsNullSparseUnion(const ArrayData& data, int64_t i) {
auto* union_type = checked_cast<const SparseUnionType*>(data.type.get());
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
const int child_id = union_type->child_ids()[types[data.offset + i]];
- return data.child_data[child_id]->IsNull(i);
+ return data.child_data[child_id]->IsNull(data.offset + i);
}
bool IsNullDenseUnion(const ArrayData& data, int64_t i) {
@@ -698,7 +698,7 @@ bool ArraySpan::IsNullSparseUnion(int64_t i) const {
auto* union_type = checked_cast<const SparseUnionType*>(this->type);
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
const int child_id = union_type->child_ids()[types[this->offset + i]];
- return this->child_data[child_id].IsNull(i);
+ return this->child_data[child_id].IsNull(this->offset + i);
}
bool ArraySpan::IsNullDenseUnion(int64_t i) const {
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index a103519dc5..2af866c775 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1532,6 +1532,41 @@ def test_union_array_to_pylist_with_nulls():
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
+def test_sparse_union_array_slice_with_nulls():
+ # GH-50105: Sliced sparse union element access can return incorrect values.
+ type_ids = pa.array([0, 1, 0, 1, 0], type=pa.int8())
+ arr = pa.UnionArray.from_sparse(
+ type_ids,
+ [
+ pa.array([None, 20, 30, 40, 50]),
+ pa.array(["a", "b", "c", None, "e"]),
+ ],
+ )
+
+ # arr == [None, "b", 30, None, 50].
+ def check_values(array, expected):
+ assert [array[i].as_py() for i in range(len(array))] == expected
+ assert array.to_pylist() == expected
+
+ check_values(arr, [None, "b", 30, None, 50])
+
+ # arr.slice(1, 4) == ["b", 30, None, 50].
+ check_values(arr.slice(1, 4), ["b", 30, None, 50])
+
+ ints_with_offset = pa.array([999, None, 20, 30, 40, 50],
+ type=pa.int64())[1:]
+ strs_with_offset = pa.array(["z", "a", "b", "c", None, "e"])[1:]
+ arr_with_sliced_children = pa.UnionArray.from_sparse(
+ type_ids, [ints_with_offset, strs_with_offset]
+ )
+
+ # arr_with_sliced_children == [None, "b", 30, None, 50].
+ check_values(arr_with_sliced_children, [None, "b", 30, None, 50])
+
+ # arr_with_sliced_children.slice(1, 4) == ["b", 30, None, 50].
+ check_values(arr_with_sliced_children.slice(1, 4), ["b", 30, None, 50])
+
+
def test_union_array_slice():
# ARROW-2314
arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()),