This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 39c8814f04 feat: extract `has_false` and `has_true` from BooleanArray 
to `BooleanBuffer` and reuse for no nulls (#9987)
39c8814f04 is described below

commit 39c8814f04150f4b8f1c3b30d7914022e7c2a292
Author: Raz Luvaton <[email protected]>
AuthorDate: Wed May 20 21:44:42 2026 +0300

    feat: extract `has_false` and `has_true` from BooleanArray to 
`BooleanBuffer` and reuse for no nulls (#9987)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    So we can use the useful helpers without creating temp `BooleanArray`
    
    # What changes are included in this PR?
    
    Extract non null variants for computing `has_false` and `has_true` from
    `BooleanArray` to `BooleanBuffer` and call them instead, and copied the
    tests for not nullable
    
    # Are these changes tested?
    Yes
    
    # Are there any user-facing changes?
    2 new functions
    
    
    -----
    
    Cc @alamb as talked in:
    - https://github.com/apache/datafusion/pull/22158#discussion_r3241321436
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/array/boolean_array.rs |  52 +----------
 arrow-buffer/src/buffer/boolean.rs     | 165 ++++++++++++++++++++++++++++++++-
 2 files changed, 166 insertions(+), 51 deletions(-)

diff --git a/arrow-array/src/array/boolean_array.rs 
b/arrow-array/src/array/boolean_array.rs
index 22a1ba7653..0fa5b5d912 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -19,7 +19,6 @@ use crate::array::print_long_array;
 use crate::builder::BooleanBuilder;
 use crate::iterator::BooleanIter;
 use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
-use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk;
 use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, bit_util};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::DataType;
@@ -157,16 +156,6 @@ impl BooleanArray {
         &self.values
     }
 
-    /// Block size for chunked fold operations in [`Self::has_true`] and 
[`Self::has_false`].
-    /// Using `chunks_exact` with this size lets the compiler fully unroll the 
inner
-    /// fold (no inner branch/loop), enabling short-circuit exits every N 
chunks.
-    const CHUNK_FOLD_BLOCK_SIZE: usize = 16;
-
-    /// Returns an [`UnalignedBitChunk`] over this array's values.
-    fn unaligned_bit_chunks(&self) -> UnalignedBitChunk<'_> {
-        UnalignedBitChunk::new(self.values().values(), self.values().offset(), 
self.len())
-    }
-
     /// Returns the number of non null, true values within this array.
     /// If you only need to check if there is at least one true value, 
consider using `has_true()` which can short-circuit and be more efficient.
     pub fn true_count(&self) -> usize {
@@ -202,16 +191,7 @@ impl BooleanArray {
                 let value_chunks = self.values().bit_chunks().iter_padded();
                 null_chunks.zip(value_chunks).any(|(n, v)| (n & v) != 0)
             }
-            None => {
-                let bit_chunks = self.unaligned_bit_chunks();
-                let chunks = bit_chunks.chunks();
-                let mut exact = 
chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
-                let found = bit_chunks.prefix().unwrap_or(0) != 0
-                    || exact.any(|block| block.iter().fold(0u64, |acc, &c| acc 
| c) != 0);
-                found
-                    || exact.remainder().iter().any(|&c| c != 0)
-                    || bit_chunks.suffix().unwrap_or(0) != 0
-            }
+            None => self.values().has_true(),
         }
     }
 
@@ -228,35 +208,7 @@ impl BooleanArray {
                 let value_chunks = self.values().bit_chunks().iter_padded();
                 null_chunks.zip(value_chunks).any(|(n, v)| (n & !v) != 0)
             }
-            None => {
-                let bit_chunks = self.unaligned_bit_chunks();
-                // UnalignedBitChunk zeros padding bits; fill them with 1s so
-                // they don't appear as false values.
-                let lead_mask = !((1u64 << bit_chunks.lead_padding()) - 1);
-                let trail_mask = if bit_chunks.trailing_padding() == 0 {
-                    u64::MAX
-                } else {
-                    (1u64 << (64 - bit_chunks.trailing_padding())) - 1
-                };
-                let (prefix_fill, suffix_fill) = match (bit_chunks.prefix(), 
bit_chunks.suffix()) {
-                    (Some(_), Some(_)) => (!lead_mask, !trail_mask),
-                    (Some(_), None) => (!lead_mask | !trail_mask, 0),
-                    (None, Some(_)) => (0, !trail_mask),
-                    (None, None) => (0, 0),
-                };
-                let chunks = bit_chunks.chunks();
-                let mut exact = 
chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
-                let found = bit_chunks
-                    .prefix()
-                    .is_some_and(|v| (v | prefix_fill) != u64::MAX)
-                    || exact
-                        .any(|block| block.iter().fold(u64::MAX, |acc, &c| acc 
& c) != u64::MAX);
-                found
-                    || exact.remainder().iter().any(|&c| c != u64::MAX)
-                    || bit_chunks
-                        .suffix()
-                        .is_some_and(|v| (v | suffix_fill) != u64::MAX)
-            }
+            None => self.values().has_false(),
         }
     }
 
diff --git a/arrow-buffer/src/buffer/boolean.rs 
b/arrow-buffer/src/buffer/boolean.rs
index 420bbf59f3..52df67080a 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::bit_chunk_iterator::BitChunks;
+use crate::bit_chunk_iterator::{BitChunks, UnalignedBitChunk};
 use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, 
BitSliceIterator};
 use crate::bit_util::read_u64;
 use crate::{
@@ -611,6 +611,11 @@ impl BooleanBuffer {
         self.into_iter()
     }
 
+    /// Returns an [`UnalignedBitChunk`] over this buffer's values.
+    fn unaligned_bit_chunks(&self) -> UnalignedBitChunk<'_> {
+        UnalignedBitChunk::new(self.values(), self.offset(), self.len())
+    }
+
     /// Returns an iterator over the set bit positions in this 
[`BooleanBuffer`]
     pub fn set_indices(&self) -> BitIndexIterator<'_> {
         BitIndexIterator::new(self.values(), self.bit_offset, self.bit_len)
@@ -625,6 +630,61 @@ impl BooleanBuffer {
     pub fn set_slices(&self) -> BitSliceIterator<'_> {
         BitSliceIterator::new(self.values(), self.bit_offset, self.bit_len)
     }
+
+    /// Block size for chunked fold operations in [`Self::has_true`] and 
[`Self::has_false`].
+    /// Using `chunks_exact` with this size lets the compiler fully unroll the 
inner
+    /// fold (no inner branch/loop), enabling short-circuit exits every N 
chunks.
+    const CHUNK_FOLD_BLOCK_SIZE: usize = 16;
+
+    /// Returns whether there is at least one `true` value in this buffer.
+    ///
+    /// This is more efficient than `count_set_bits() > 0` because it can 
short-circuit
+    /// as soon as a `true` value is found, without counting all set bits.
+    ///
+    /// Returns `false` for empty buffer.
+    pub fn has_true(&self) -> bool {
+        let bit_chunks = self.unaligned_bit_chunks();
+        let chunks = bit_chunks.chunks();
+        let mut exact = chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
+        let found = bit_chunks.prefix().unwrap_or(0) != 0
+            || exact.any(|block| block.iter().fold(0u64, |acc, &c| acc | c) != 
0);
+        found || exact.remainder().iter().any(|&c| c != 0) || 
bit_chunks.suffix().unwrap_or(0) != 0
+    }
+
+    /// Returns whether there is at least one `false` value in this buffer.
+    ///
+    /// This is more efficient than `len() > count_set_bits()` because it can 
short-circuit
+    /// as soon as a `false` value is found, without counting all set bits.
+    ///
+    /// Returns `false` for empty buffer.
+    pub fn has_false(&self) -> bool {
+        let bit_chunks = self.unaligned_bit_chunks();
+        // UnalignedBitChunk zeros padding bits; fill them with 1s so
+        // they don't appear as false values.
+        let lead_mask = !((1u64 << bit_chunks.lead_padding()) - 1);
+        let trail_mask = if bit_chunks.trailing_padding() == 0 {
+            u64::MAX
+        } else {
+            (1u64 << (64 - bit_chunks.trailing_padding())) - 1
+        };
+        let (prefix_fill, suffix_fill) = match (bit_chunks.prefix(), 
bit_chunks.suffix()) {
+            (Some(_), Some(_)) => (!lead_mask, !trail_mask),
+            (Some(_), None) => (!lead_mask | !trail_mask, 0),
+            (None, Some(_)) => (0, !trail_mask),
+            (None, None) => (0, 0),
+        };
+        let chunks = bit_chunks.chunks();
+        let mut exact = chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
+        let found = bit_chunks
+            .prefix()
+            .is_some_and(|v| (v | prefix_fill) != u64::MAX)
+            || exact.any(|block| block.iter().fold(u64::MAX, |acc, &c| acc & 
c) != u64::MAX);
+        found
+            || exact.remainder().iter().any(|&c| c != u64::MAX)
+            || bit_chunks
+                .suffix()
+                .is_some_and(|v| (v | suffix_fill) != u64::MAX)
+    }
 }
 
 impl Not for &BooleanBuffer {
@@ -1245,4 +1305,107 @@ mod tests {
         let buffer = BooleanBuffer::new_unset(100);
         assert_eq!(buffer.clone().find_nth_set_bit_position(0, 1), 100);
     }
+
+    #[test]
+    fn test_has_true_has_false_all_true() {
+        let arr = BooleanBuffer::from(vec![true, true, true]);
+        assert!(arr.has_true());
+        assert!(!arr.has_false());
+    }
+
+    #[test]
+    fn test_has_true_has_false_all_false() {
+        let arr = BooleanBuffer::from(vec![false, false, false]);
+        assert!(!arr.has_true());
+        assert!(arr.has_false());
+    }
+
+    #[test]
+    fn test_has_true_has_false_mixed() {
+        let arr = BooleanBuffer::from(vec![true, false, true]);
+        assert!(arr.has_true());
+        assert!(arr.has_false());
+    }
+
+    #[test]
+    fn test_has_true_has_false_empty() {
+        let arr = BooleanBuffer::from(Vec::<bool>::new());
+        assert!(!arr.has_true());
+        assert!(!arr.has_false());
+    }
+
+    #[test]
+    fn test_has_false_aligned_suffix_all_true() {
+        let arr = BooleanBuffer::from(vec![true; 129]);
+        assert!(arr.has_true());
+        assert!(!arr.has_false());
+    }
+
+    #[test]
+    fn test_has_false_non_aligned_all_true() {
+        // 65 elements: exercises the remainder path in has_false
+        let arr = BooleanBuffer::from(vec![true; 65]);
+        assert!(arr.has_true());
+        assert!(!arr.has_false());
+    }
+
+    #[test]
+    fn test_has_false_non_aligned_last_false() {
+        // 64 trues + 1 false: remainder path should find the false
+        let mut values = vec![true; 64];
+        values.push(false);
+        let arr = BooleanBuffer::from(values);
+        assert!(arr.has_true());
+        assert!(arr.has_false());
+    }
+
+    #[test]
+    fn test_has_false_exact_64_all_true() {
+        // Exactly 64 elements, no remainder
+        let arr = BooleanBuffer::from(vec![true; 64]);
+        assert!(arr.has_true());
+        assert!(!arr.has_false());
+    }
+
+    #[test]
+    fn test_has_true_has_false_unaligned_slices() {
+        let cases = [
+            (1, 129, true, false),
+            (3, 130, true, false),
+            (5, 65, true, false),
+            (7, 64, true, false),
+        ];
+
+        let base = BooleanBuffer::from(vec![true; 300]);
+
+        for (offset, len, expected_has_true, expected_has_false) in cases {
+            let arr = base.slice(offset, len);
+            assert_eq!(
+                arr.has_true(),
+                expected_has_true,
+                "offset={offset} len={len}"
+            );
+            assert_eq!(
+                arr.has_false(),
+                expected_has_false,
+                "offset={offset} len={len}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_has_true_has_false_exact_multiples_of_64() {
+        let cases = [
+            (64, true, false),
+            (128, true, false),
+            (192, true, false),
+            (256, true, false),
+        ];
+
+        for (len, expected_has_true, expected_has_false) in cases {
+            let arr = BooleanBuffer::from(vec![true; len]);
+            assert_eq!(arr.has_true(), expected_has_true, "len={len}");
+            assert_eq!(arr.has_false(), expected_has_false, "len={len}");
+        }
+    }
 }

Reply via email to