(arrow) branch main updated: GH-50043: [C++][Python] Fix hash_any/hash_all on sliced boolean arrays (#50094)

rok Thu, 04 Jun 2026 07:03:12 -0700

This is an automated email from the ASF dual-hosted git repository.

rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 95cb5d0845 GH-50043: [C++][Python] Fix hash_any/hash_all on sliced boolean arrays (#50094)
95cb5d0845 is described below

commit 95cb5d084506f97a16df006053642e9690ab731d
Author: fenfeng9 <[email protected]>
AuthorDate: Thu Jun 4 22:02:52 2026 +0800

    GH-50043: [C++][Python] Fix hash_any/hash_all on sliced boolean arrays (#50094)
    
    ### Rationale for this change
    
    `hash_any` and `hash_all` could return incorrect results for sliced nullable boolean arrays.
    
    The validity bitmap used the slice offset, but the boolean values bitmap did not.
    
    ### What changes are included in this PR?
    
    Apply the slice offset when reading boolean values in `hash_any` / `hash_all`.
    
    Add C++ and Python regression tests.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #50043
    
    Authored-by: fenfeng9 <[email protected]>
    Signed-off-by: Rok Mihevc <[email protected]>
---
 cpp/src/arrow/acero/hash_aggregate_test.cc      | 38 +++++++++++++++++++++++++
 cpp/src/arrow/compute/kernels/hash_aggregate.cc |  3 +-
 python/pyarrow/tests/test_table.py              | 32 +++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc
index 12d24429cb..442dcd5883 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -2157,6 +2157,44 @@ TEST_P(GroupBy, AnyAndAll) {
   }
 }
 
+TEST_P(GroupBy, AnyAllSlicedNullableBoolean) {
+  auto table = TableFromJSON(schema({field("any_arg", boolean()),
+                                     field("all_arg", boolean()), field("key", int64())}),
+                             {R"([
+    [true,  false, 99],
+    [false, true,  10],
+    [null,  null,  10]
+  ])"});
+  auto sliced = table->Slice(1);
+
+  // GH-50043: hash_any/hash_all should respect the slice offset.
+  // After Slice(1), any_arg=[false, null] and all_arg=[true, null].
+  auto expected = ArrayFromJSON(struct_({
+                                    field("key_0", int64()),
+                                    field("hash_any", boolean()),
+                                    field("hash_all", boolean()),
+                                }),
+                                R"([
+      [10, false, true]
+    ])");
+
+  for (bool use_threads : {true, false}) {
+    SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+
+    ASSERT_OK_AND_ASSIGN(auto actual, GroupByTest({sliced->GetColumnByName("any_arg"),
+                                                   sliced->GetColumnByName("all_arg")},
+                                                  {sliced->GetColumnByName("key")},
+                                                  {
+                                                      {"hash_any", nullptr},
+                                                      {"hash_all", nullptr},
+                                                  },
+                                                  use_threads));
+    ValidateOutput(actual);
+
+    AssertDatumsEqual(expected, actual, /*verbose=*/true);
+  }
+}
+
 TEST_P(GroupBy, AnyAllScalar) {
   BatchesWithSchema input;
   input.batches = {
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 8359945319..d07096236d 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1261,7 +1261,8 @@ struct GroupedBooleanAggregator : public GroupedAggregator {
             input.buffers[0].data, input.offset, input.length,
             [&](int64_t position) {
               counts[*g]++;
-              Impl::UpdateGroupWith(reduced, *g, bit_util::GetBit(bitmap, position));
+              Impl::UpdateGroupWith(reduced, *g,
+                                    bit_util::GetBit(bitmap, input.offset + position));
               g++;
             },
             [&] { bit_util::SetBitTo(no_nulls, *g++, false); });
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index b65fb7d952..c6dbbc5145 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2976,6 +2976,38 @@ def test_table_group_by():
     }
 
 
+@pytest.mark.acero
+def test_group_by_sliced_any_all():
+    # GH-50043: hash_any/hash_all produce incorrect results on sliced boolean arrays
+    # Row 0 will be discarded by slice, should not affect aggregation
+    table = pa.table(
+        {
+            "g": [99, 10, 10],
+            "any_arg": [True, False, None],
+            "all_arg": [False, True, None],
+        }
+    )
+    sliced = table.slice(1)
+
+    expected = pa.table(
+        {
+            "g": [10],
+            "any_arg_any": [False],
+            "all_arg_all": [True],
+        }
+    )
+
+    # any(False, None) = False, all(True, None) = True
+    for use_threads in [False, True]:
+        result = sliced.group_by("g", use_threads=use_threads).aggregate(
+            [
+                ("any_arg", "any"),
+                ("all_arg", "all"),
+            ]
+        )
+        assert result.equals(expected)
+
+
 @pytest.mark.acero
 def test_table_group_by_first():
     # "first" is an ordered aggregation -> requires to specify use_threads=False

(arrow) branch main updated: GH-50043: [C++][Python] Fix hash_any/hash_all on sliced boolean arrays (#50094)

Reply via email to