This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1b3f313009 GH-49888: [C++][Compute] Fix count for run-end encoded
arrays with nulls (#49908)
1b3f313009 is described below
commit 1b3f31300979cd6976a2125a8dccfa7c049706ca
Author: fenfeng9 <[email protected]>
AuthorDate: Wed May 6 22:24:03 2026 +0800
GH-49888: [C++][Compute] Fix count for run-end encoded arrays with nulls
(#49908)
### Rationale for this change
The `count` kernel used `GetNullCount()`, which reports the physical null
count. For run-end encoded arrays, this ignored nulls in the encoded values
child.
### What changes are included in this PR?
Use `ComputeLogicalNullCount()` in the `count` kernel so run-end encoded
arrays are counted correctly. Add C++ and Python tests for this case.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #49888
Authored-by: fenfeng9 <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +-
cpp/src/arrow/compute/kernels/aggregate_test.cc | 10 ++++++++++
python/pyarrow/tests/test_compute.py | 12 ++++++++++++
3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 03fba53ac0..f7ff72bc11 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -103,7 +103,7 @@ struct CountImpl : public ScalarAggregator {
this->non_nulls += batch.length;
} else if (batch[0].is_array()) {
const ArraySpan& input = batch[0].array;
- const int64_t nulls = input.GetNullCount();
+ const int64_t nulls = input.ComputeLogicalNullCount();
this->nulls += nulls;
this->non_nulls += input.length - nulls;
} else {
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index c5ba012d76..6783475db3 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -941,6 +941,16 @@ TYPED_TEST(TestCountKernel, SimpleCount) {
EXPECT_THAT(Count(*MakeScalar(ty, 1), all), ResultWith(Datum(int64_t(1))));
}
+TEST(TestCountKernel, RunEndEncodedNulls) {
+ auto input = ArrayFromJSON(int32(), "[1, 1, null, null, null, 2, 2, 2, null, 3]");
+ ASSERT_OK_AND_ASSIGN(auto encoded, RunEndEncode(input));
+
+ auto array = encoded.make_array();
+ ValidateCount(*array, {6, 4});
+ // Logical slice: [null, null, 2, 2, 2, null].
+ ValidateCount(*array->Slice(3, 6), {3, 3});
+}
+
template <typename ArrowType>
class TestRandomNumericCountKernel : public ::testing::Test {};
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 8c3b09f612..4e44a912d9 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -2849,6 +2849,18 @@ def test_count():
pc.count(arr, 'something else')
+def test_count_run_end_encoded_nulls():
+ arr = pc.run_end_encode(
+ pa.array([1, 1, None, None, None, 2, 2, 2, None, 3]))
+
+ assert pc.count(arr, mode="only_valid").as_py() == 6
+ assert pc.count(arr, mode="only_null").as_py() == 4
+ assert pc.count(arr, mode="all").as_py() == 10
+ # Slice crosses run boundaries: logical [None, None, 2, 2, 2, None].
+ assert pc.count(arr.slice(3, 6), mode="only_valid").as_py() == 3
+ assert pc.count(arr.slice(3, 6), mode="only_null").as_py() == 3
+
+
def test_index():
arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
assert pc.index(arr, pa.scalar(0)).as_py() == 0