This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fea7897a78 GH-50113: [C++][Python] Fix `count` for sliced union arrays
(#50114)
fea7897a78 is described below
commit fea7897a78eebe51579f712f5238bb3c50f2e7c9
Author: fenfeng9 <[email protected]>
AuthorDate: Thu Jun 11 00:00:32 2026 +0800
GH-50113: [C++][Python] Fix `count` for sliced union arrays (#50114)
### Rationale for this change
Sliced union arrays could report incorrect results from `count`.
For sliced union arrays, `span.GetValues<int8_t>(1)` and
`span.GetValues<int32_t>(2)` already return offset-adjusted pointers. The
logical null count path then indexed those pointers with `span.offset + i`,
effectively applying the parent offset twice.
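The sparse union path also checked child nullness with `i` instead of
`span.offset + i`, even though the children of a sliced sparse union are
not themselves offset-adjusted and so must be indexed with the parent offset.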
### What changes are included in this PR?
* Fixes logical null counting for sliced sparse and dense union arrays.
* Adds C++ and Python regression tests.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #50113
Authored-by: fenfeng9 <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/kernels/aggregate_test.cc | 30 +++++++++++++++++++
cpp/src/arrow/util/union_util.cc | 8 ++---
python/pyarrow/tests/test_compute.py | 39 +++++++++++++++++++++++++
3 files changed, 73 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index 6783475db3..bfc3442a73 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -951,6 +951,36 @@ TEST(TestCountKernel, RunEndEncodedNulls) {
ValidateCount(*array->Slice(3, 6), {3, 3});
}
+TEST(TestCountKernel, SparseUnionSlicedNulls) {
+ // GH-50113: Sliced unions can report incorrect null counts in count.
+ auto type_ids = ArrayFromJSON(int8(), "[0, 1, 0, 0, 1, 1]");
+ ArrayVector children = {
+ ArrayFromJSON(float64(), "[0.5, 99.0, null, 3.0, 88.0, 77.0]"),
+ ArrayFromJSON(boolean(), "[false, null, true, false, true, false]")};
+ ASSERT_OK_AND_ASSIGN(auto array,
+ SparseUnionArray::Make(*type_ids, std::move(children)));
+
+ // Logical array: [0.5, null, null, 3.0, true, false].
+ ValidateCount(*array, {4, 2});
+ // Logical slice: [null, null, 3.0, true].
+ ValidateCount(*array->Slice(1, 4), {2, 2});
+}
+
+TEST(TestCountKernel, DenseUnionSlicedNulls) {
+ // GH-50113: Sliced unions can report incorrect null counts in count.
+ auto type_ids = ArrayFromJSON(int8(), "[0, 1, 0, 0, 1, 1]");
+ auto value_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1, 2]");
+ ArrayVector children = {ArrayFromJSON(float64(), "[0.5, null, 3.0]"),
+ ArrayFromJSON(boolean(), "[null, true, false]")};
+ ASSERT_OK_AND_ASSIGN(
+ auto array, DenseUnionArray::Make(*type_ids, *value_offsets, std::move(children)));
+
+ // Logical array: [0.5, null, null, 3.0, true, false].
+ ValidateCount(*array, {4, 2});
+ // Logical slice: [null, null, 3.0, true].
+ ValidateCount(*array->Slice(1, 4), {2, 2});
+}
+
template <typename ArrowType>
class TestRandomNumericCountKernel : public ::testing::Test {};
diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc
index 7f2a150db8..6b4d752d86 100644
--- a/cpp/src/arrow/util/union_util.cc
+++ b/cpp/src/arrow/util/union_util.cc
@@ -33,9 +33,9 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span) {
const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
int64_t null_count = 0;
for (int64_t i = 0; i < span.length; i++) {
- const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]];
+ const int8_t child_id = sparse_union_type->child_ids()[types[i]];
- null_count += span.child_data[child_id].IsNull(i);
+ null_count += span.child_data[child_id].IsNull(span.offset + i);
}
return null_count;
}
@@ -48,8 +48,8 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) {
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
int64_t null_count = 0;
for (int64_t i = 0; i < span.length; i++) {
- const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]];
- const int32_t offset = offsets[span.offset + i];
+ const int8_t child_id = dense_union_type->child_ids()[types[i]];
+ const int32_t offset = offsets[i];
null_count += span.child_data[child_id].IsNull(offset);
}
return null_count;
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 1c82d6c944..4e7f506ba5 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -2889,6 +2889,45 @@ def test_count_run_end_encoded_nulls():
assert pc.count(arr.slice(3, 6), mode="only_null").as_py() == 3
+def test_count_sparse_union_sliced_nulls():
+ # GH-50113: Sliced unions can report incorrect null counts in count.
+ arr = pa.UnionArray.from_sparse(
+ pa.array([0, 1, 0, 0, 1, 1], type=pa.int8()),
+ [
+ pa.array([0.5, 99.0, None, 3.0, 88.0, 77.0]),
+ pa.array([False, None, True, False, True, False]),
+ ]
+ )
+
+ # Logical array: [0.5, None, None, 3.0, True, False].
+ assert pc.count(arr, mode="only_valid").as_py() == 4
+ assert pc.count(arr, mode="only_null").as_py() == 2
+ assert pc.count(arr, mode="all").as_py() == 6
+ # Logical slice: [None, None, 3.0, True].
+ assert pc.count(arr.slice(1, 4), mode="only_valid").as_py() == 2
+ assert pc.count(arr.slice(1, 4), mode="only_null").as_py() == 2
+
+
+def test_count_dense_union_sliced_nulls():
+ # GH-50113: Sliced unions can report incorrect null counts in count.
+ arr = pa.UnionArray.from_dense(
+ pa.array([0, 1, 0, 0, 1, 1], type=pa.int8()),
+ pa.array([0, 0, 1, 2, 1, 2], type=pa.int32()),
+ [
+ pa.array([0.5, None, 3.0]),
+ pa.array([None, True, False]),
+ ]
+ )
+
+ # Logical array: [0.5, None, None, 3.0, True, False].
+ assert pc.count(arr, mode="only_valid").as_py() == 4
+ assert pc.count(arr, mode="only_null").as_py() == 2
+ assert pc.count(arr, mode="all").as_py() == 6
+ # Logical slice: [None, None, 3.0, True].
+ assert pc.count(arr.slice(1, 4), mode="only_valid").as_py() == 2
+ assert pc.count(arr.slice(1, 4), mode="only_null").as_py() == 2
+
+
def test_index():
arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
assert pc.index(arr, pa.scalar(0)).as_py() == 0