This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 95cb5d0845 GH-50043: [C++][Python] Fix hash_any/hash_all on sliced
boolean arrays (#50094)
95cb5d0845 is described below
commit 95cb5d084506f97a16df006053642e9690ab731d
Author: fenfeng9 <[email protected]>
AuthorDate: Thu Jun 4 22:02:52 2026 +0800
GH-50043: [C++][Python] Fix hash_any/hash_all on sliced boolean arrays
(#50094)
### Rationale for this change
`hash_any` and `hash_all` could return incorrect results for sliced
nullable boolean arrays.
The validity bitmap used the slice offset, but the boolean values bitmap
did not.
### What changes are included in this PR?
Apply the slice offset when reading boolean values in `hash_any` /
`hash_all`.
Add C++ and Python regression tests.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #50043
Authored-by: fenfeng9 <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
cpp/src/arrow/acero/hash_aggregate_test.cc | 38 +++++++++++++++++++++++++
cpp/src/arrow/compute/kernels/hash_aggregate.cc | 3 +-
python/pyarrow/tests/test_table.py | 32 +++++++++++++++++++++
3 files changed, 72 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc
index 12d24429cb..442dcd5883 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -2157,6 +2157,44 @@ TEST_P(GroupBy, AnyAndAll) {
}
}
+TEST_P(GroupBy, AnyAllSlicedNullableBoolean) {
+ auto table = TableFromJSON(schema({field("any_arg", boolean()),
+ field("all_arg", boolean()), field("key", int64())}),
+ {R"([
+ [true, false, 99],
+ [false, true, 10],
+ [null, null, 10]
+ ])"});
+ auto sliced = table->Slice(1);
+
+ // GH-50043: hash_any/hash_all should respect the slice offset.
+ // After Slice(1), any_arg=[false, null] and all_arg=[true, null].
+ auto expected = ArrayFromJSON(struct_({
+ field("key_0", int64()),
+ field("hash_any", boolean()),
+ field("hash_all", boolean()),
+ }),
+ R"([
+ [10, false, true]
+ ])");
+
+ for (bool use_threads : {true, false}) {
+ SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+
+ ASSERT_OK_AND_ASSIGN(auto actual, GroupByTest({sliced->GetColumnByName("any_arg"),
+ sliced->GetColumnByName("all_arg")},
+ {sliced->GetColumnByName("key")},
+ {
+ {"hash_any", nullptr},
+ {"hash_all", nullptr},
+ },
+ use_threads));
+ ValidateOutput(actual);
+
+ AssertDatumsEqual(expected, actual, /*verbose=*/true);
+ }
+}
+
TEST_P(GroupBy, AnyAllScalar) {
BatchesWithSchema input;
input.batches = {
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 8359945319..d07096236d 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1261,7 +1261,8 @@ struct GroupedBooleanAggregator : public GroupedAggregator {
input.buffers[0].data, input.offset, input.length,
[&](int64_t position) {
counts[*g]++;
- Impl::UpdateGroupWith(reduced, *g, bit_util::GetBit(bitmap, position));
+ Impl::UpdateGroupWith(reduced, *g,
+ bit_util::GetBit(bitmap, input.offset + position));
g++;
},
[&] { bit_util::SetBitTo(no_nulls, *g++, false); });
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index b65fb7d952..c6dbbc5145 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2976,6 +2976,38 @@ def test_table_group_by():
}
[email protected]
+def test_group_by_sliced_any_all():
+ # GH-50043: hash_any/hash_all produce incorrect results on sliced boolean arrays
+ # Row 0 will be discarded by slice, should not affect aggregation
+ table = pa.table(
+ {
+ "g": [99, 10, 10],
+ "any_arg": [True, False, None],
+ "all_arg": [False, True, None],
+ }
+ )
+ sliced = table.slice(1)
+
+ expected = pa.table(
+ {
+ "g": [10],
+ "any_arg_any": [False],
+ "all_arg_all": [True],
+ }
+ )
+
+ # any(False, None) = False, all(True, None) = True
+ for use_threads in [False, True]:
+ result = sliced.group_by("g", use_threads=use_threads).aggregate(
+ [
+ ("any_arg", "any"),
+ ("all_arg", "all"),
+ ]
+ )
+ assert result.equals(expected)
+
+
@pytest.mark.acero
def test_table_group_by_first():
# "first" is an ordered aggregation -> requires to specify
use_threads=False