alamb commented on code in PR #9297:
URL: https://github.com/apache/arrow-rs/pull/9297#discussion_r2779168519


##########
arrow-buffer/src/buffer/boolean.rs:
##########
@@ -519,17 +494,14 @@ impl BitAnd<&BooleanBuffer> for &BooleanBuffer {
 
     fn bitand(self, rhs: &BooleanBuffer) -> Self::Output {
         assert_eq!(self.bit_len, rhs.bit_len);
-        BooleanBuffer {
-            buffer: buffer_bin_and(
-                &self.buffer,
-                self.bit_offset,
-                &rhs.buffer,
-                rhs.bit_offset,
-                self.bit_len,
-            ),
-            bit_offset: 0,
-            bit_len: self.bit_len,
-        }
+        BooleanBuffer::from_bitwise_binary_op(

Review Comment:
   This change seems unrelated to the improvements in bitwise binary op and is perhaps the source of the 50% reported slowdown of and?
   
   ```
   group            main                                       optimize_from_bitwise_unary_op
   -----            ----                                       ------------------------------
   and              1.00    209.3±2.38ns        ? ?/sec        1.48    310.3±1.11ns        ? ?/sec
   ```
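
   For reference, a minimal criterion-style sketch of how the `and` comparison could be reproduced (the bench name, buffer sizes, and setup here are illustrative assumptions, not the repo's actual benchmark):

   ```rust
   use arrow_buffer::BooleanBuffer;
   use criterion::{criterion_group, criterion_main, Criterion};

   fn bench_and(c: &mut Criterion) {
       // Arbitrary 8192-bit inputs chosen for illustration
       let left = BooleanBuffer::collect_bool(8192, |i| i % 3 == 0);
       let right = BooleanBuffer::collect_bool(8192, |i| i % 5 == 0);
       // `&BooleanBuffer & &BooleanBuffer` exercises the BitAnd impl shown above
       c.bench_function("and", |b| b.iter(|| &left & &right));
   }

   criterion_group!(benches, bench_and);
   criterion_main!(benches);
   ```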



##########
arrow-buffer/src/buffer/boolean.rs:
##########
@@ -141,15 +139,15 @@ impl BooleanBuffer {
     /// input buffer.
     ///
     /// # Notes:
-    /// * The new `BooleanBuffer` has zero offset, even if `offset_in_bits` is non-zero
+    /// * The new `BooleanBuffer` keeps the non-zero offset if not aligned by byte and may have padding

Review Comment:
   I think there is at least one place that needs to be updated due to this
   
   
https://github.com/apache/arrow-rs/blob/96637fc8b928a94de53bbec3501337c0ecfbf936/arrow-buffer/src/buffer/ops.rs#L145-L162
   
   the `into_inner` call, I think, silently drops any non-zero offset
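
   As a standalone illustration of that concern (a hedged sketch, not the PR's code), rewrapping the raw `Buffer` from `into_inner` at offset 0 no longer describes the same bits once the result carries a non-zero offset:

   ```rust
   use arrow_buffer::BooleanBuffer;

   fn main() {
       let bytes = vec![0b1010_1010u8; 16];
       // Hypothetical result carrying a bit offset of 3, as the new code paths may now produce
       let with_offset = BooleanBuffer::new(bytes.into(), 3, 64);

       // Discarding the offset via into_inner() and rewrapping at offset 0
       // silently shifts which bits are described
       let rewrapped = BooleanBuffer::new(with_offset.clone().into_inner(), 0, 64);

       assert_ne!(
           with_offset.iter().collect::<Vec<_>>(),
           rewrapped.iter().collect::<Vec<_>>()
       );
   }
   ```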
   



##########
arrow-buffer/src/buffer/boolean.rs:
##########
@@ -191,67 +189,48 @@ impl BooleanBuffer {
     where
         F: FnMut(u64) -> u64,
     {
-        // try fast path for aligned input
-        if offset_in_bits & 0x7 == 0 {
-            // align to byte boundary
-            let aligned = &src.as_ref()[offset_in_bits / 8..];
-            if let Some(result) =
-                Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op)
-            {
-                return result;
+        let end = offset_in_bits + len_in_bits;
+        // Align start and end to 64 bit (8 byte) boundaries if possible to allow using the
+        // optimized code path as much as possible.
+        let aligned_offset = offset_in_bits & !63;
+        let aligned_end_bytes = bit_util::ceil(end, 64) * 8;
+        let src_len = src.as_ref().len();
+        let slice_end = aligned_end_bytes.min(src_len);
+
+        let aligned_start = &src.as_ref()[aligned_offset / 8..slice_end];
+
+        let (prefix, aligned_u64s, suffix) = unsafe { aligned_start.as_ref().align_to::<u64>() };
+        match (prefix, suffix) {
+            ([], []) => {
+                // the buffer is word (64 bit) aligned, so use optimized Vec code.
+                let result_u64s: Vec<u64> = aligned_u64s.iter().map(|l| op(*l)).collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-
-        let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
-        let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
-        for chunk in chunks.iter() {
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunk));
+            ([], suffix) => {
+                let suffix = read_u64(suffix);
+                let result_u64s: Vec<u64> = aligned_u64s
+                    .iter()
+                    .cloned()
+                    .chain(std::iter::once(suffix))
+                    .map(&mut op)
+                    .collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-        if chunks.remainder_len() > 0 {
-            debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunks.remainder_bits()));
-            }
-            // Just pushed one u64, which may have trailing zeros
-            result.truncate(chunks.num_bytes());
+            _ => {}
         }
 
-        BooleanBuffer {
-            buffer: Buffer::from(result),
-            bit_offset: 0,
-            bit_len: len_in_bits,
-        }
-    }
+        // align to byte boundaries
+        // Use unaligned code path, handle remainder bytes
+        let chunks = aligned_start.chunks_exact(8);
+        let remainder = chunks.remainder();
+        let iter = chunks.map(|c| u64::from_le_bytes(c.try_into().unwrap()));
+        let vec_u64s: Vec<u64> = if remainder.is_empty() {
+            iter.map(&mut op).collect()
+        } else {
+            iter.chain(Some(read_u64(remainder))).map(&mut op).collect()
+        };
 
-    /// Fast path for [`Self::from_bitwise_unary_op`] when input is aligned to
-    /// 8-byte (64-bit) boundaries
-    ///
-    /// Returns None if the fast path cannot be taken
-    fn try_from_aligned_bitwise_unary_op<F>(
-        src: &[u8],
-        len_in_bits: usize,
-        op: &mut F,
-    ) -> Option<Self>
-    where
-        F: FnMut(u64) -> u64,
-    {
-        // Safety: all valid bytes are valid u64s
-        let (prefix, aligned_u6us, suffix) = unsafe { src.align_to::<u64>() };
-        if !(prefix.is_empty() && suffix.is_empty()) {
-            // Couldn't make this case any faster than the default path, see
-            // https://github.com/apache/arrow-rs/pull/8996/changes#r2620022082
-            return None;
-        }
-        // the buffer is word (64 bit) aligned, so use optimized Vec code.
-        let result_u64s: Vec<u64> = aligned_u6us.iter().map(|l| op(*l)).collect();
-        let buffer = Buffer::from(result_u64s);
-        Some(BooleanBuffer::new(buffer, 0, len_in_bits))
+        BooleanBuffer::new(vec_u64s.into(), offset_in_bits % 64, len_in_bits)

Review Comment:
   This is a key difference in the two approaches -- the current code on main will produce an output buffer that is aligned (offset is 0), but this code will produce an output buffer that is not aligned (same as the input)

   That is probably why the benchmark results can be so much better in this case -- because the output is different (though still correct)
   
   This is probably ok, but I wanted to point it out as a potential side effect
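
   A small sketch of the observable difference, assuming `Not` for `&BooleanBuffer` routes through the unary-op path and that (per this change) the result keeps the input's offset:

   ```rust
   use arrow_buffer::BooleanBuffer;

   fn main() {
       let input = BooleanBuffer::collect_bool(200, |i| i % 3 == 0);
       // Slice at a non-byte-aligned position so the input carries offset 5
       let sliced = input.slice(5, 100);

       let negated = !&sliced;

       // The logical values match either way
       assert_eq!(negated.len(), 100);
       assert!(negated.iter().zip(sliced.iter()).all(|(n, s)| n == !s));

       // On main this prints 0; with this change it would print 5 (same as the input)
       println!("output offset = {}", negated.offset());
   }
   ```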



##########
arrow-buffer/src/buffer/boolean.rs:
##########
@@ -191,67 +189,48 @@ impl BooleanBuffer {
     where
         F: FnMut(u64) -> u64,
     {
-        // try fast path for aligned input
-        if offset_in_bits & 0x7 == 0 {
-            // align to byte boundary
-            let aligned = &src.as_ref()[offset_in_bits / 8..];
-            if let Some(result) =
-                Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op)
-            {
-                return result;
+        let end = offset_in_bits + len_in_bits;
+        // Align start and end to 64 bit (8 byte) boundaries if possible to allow using the
+        // optimized code path as much as possible.
+        let aligned_offset = offset_in_bits & !63;
+        let aligned_end_bytes = bit_util::ceil(end, 64) * 8;
+        let src_len = src.as_ref().len();
+        let slice_end = aligned_end_bytes.min(src_len);
+
+        let aligned_start = &src.as_ref()[aligned_offset / 8..slice_end];
+
+        let (prefix, aligned_u64s, suffix) = unsafe { aligned_start.as_ref().align_to::<u64>() };
+        match (prefix, suffix) {
+            ([], []) => {
+                // the buffer is word (64 bit) aligned, so use optimized Vec code.
+                let result_u64s: Vec<u64> = aligned_u64s.iter().map(|l| op(*l)).collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-
-        let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
-        let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
-        for chunk in chunks.iter() {
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunk));
+            ([], suffix) => {
+                let suffix = read_u64(suffix);
+                let result_u64s: Vec<u64> = aligned_u64s
+                    .iter()
+                    .cloned()
+                    .chain(std::iter::once(suffix))
+                    .map(&mut op)
+                    .collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-        if chunks.remainder_len() > 0 {
-            debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunks.remainder_bits()));
-            }
-            // Just pushed one u64, which may have trailing zeros
-            result.truncate(chunks.num_bytes());
+            _ => {}
         }
 
-        BooleanBuffer {
-            buffer: Buffer::from(result),
-            bit_offset: 0,
-            bit_len: len_in_bits,
-        }
-    }
+        // align to byte boundaries
+        // Use unaligned code path, handle remainder bytes
+        let chunks = aligned_start.chunks_exact(8);
+        let remainder = chunks.remainder();
+        let iter = chunks.map(|c| u64::from_le_bytes(c.try_into().unwrap()));
+        let vec_u64s: Vec<u64> = if remainder.is_empty() {
+            iter.map(&mut op).collect()

Review Comment:
   In theory the remainder should never be empty, right? Otherwise the aligned path above would be hit.
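
   For what it's worth, a standalone sketch (plain std, no arrow types) of the two notions of alignment in play: `align_to::<u64>()` depends on the slice's pointer alignment, while `chunks_exact(8).remainder()` depends only on its length, so the answer seems to hinge on whether those two can diverge for the sliced input:

   ```rust
   fn main() {
       let backing = vec![0u8; 24];
       // Skip one byte so the slice pointer is (typically) not 8-byte aligned,
       // while the slice length (16) is still an exact multiple of 8
       let slice = &backing[1..17];

       // Pointer view: a non-empty prefix means align_to found no aligned start
       let (prefix, _mid, _suffix) = unsafe { slice.align_to::<u64>() };

       // Length view: the remainder is empty because 16 % 8 == 0
       let remainder = slice.chunks_exact(8).remainder();

       println!(
           "prefix empty: {}, remainder empty: {}",
           prefix.is_empty(),
           remainder.is_empty()
       );
   }
   ```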



##########
arrow-buffer/src/buffer/immutable.rs:
##########
@@ -343,7 +343,17 @@ impl Buffer {
             return self.slice_with_length(offset / 8, bit_util::ceil(len, 8));
         }
 
-        BooleanBuffer::from_bits(self.as_slice(), offset, len).into_inner()
+        let chunks = self.bit_chunks(offset, len);

Review Comment:
   This change also seems unrelated -- perhaps we can pull it into its own PR



##########
arrow-buffer/src/buffer/boolean.rs:
##########
@@ -191,67 +189,48 @@ impl BooleanBuffer {
     where
         F: FnMut(u64) -> u64,
     {
-        // try fast path for aligned input
-        if offset_in_bits & 0x7 == 0 {
-            // align to byte boundary
-            let aligned = &src.as_ref()[offset_in_bits / 8..];
-            if let Some(result) =
-                Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op)
-            {
-                return result;
+        let end = offset_in_bits + len_in_bits;
+        // Align start and end to 64 bit (8 byte) boundaries if possible to allow using the
+        // optimized code path as much as possible.
+        let aligned_offset = offset_in_bits & !63;
+        let aligned_end_bytes = bit_util::ceil(end, 64) * 8;
+        let src_len = src.as_ref().len();
+        let slice_end = aligned_end_bytes.min(src_len);
+
+        let aligned_start = &src.as_ref()[aligned_offset / 8..slice_end];
+
+        let (prefix, aligned_u64s, suffix) = unsafe { aligned_start.as_ref().align_to::<u64>() };
+        match (prefix, suffix) {
+            ([], []) => {
+                // the buffer is word (64 bit) aligned, so use optimized Vec code.
+                let result_u64s: Vec<u64> = aligned_u64s.iter().map(|l| op(*l)).collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-
-        let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
-        let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
-        for chunk in chunks.iter() {
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunk));
+            ([], suffix) => {
+                let suffix = read_u64(suffix);
+                let result_u64s: Vec<u64> = aligned_u64s
+                    .iter()
+                    .cloned()
+                    .chain(std::iter::once(suffix))
+                    .map(&mut op)
+                    .collect();
+                return BooleanBuffer::new(result_u64s.into(), offset_in_bits % 64, len_in_bits);
             }
-        }
-        if chunks.remainder_len() > 0 {
-            debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
-            // SAFETY: reserved enough capacity above, (exactly num_u64s()
-            // items) and we assume `BitChunks` correctly reports upper bound
-            unsafe {
-                result.push_unchecked(op(chunks.remainder_bits()));
-            }
-            // Just pushed one u64, which may have trailing zeros
-            result.truncate(chunks.num_bytes());
+            _ => {}
         }
 
-        BooleanBuffer {
-            buffer: Buffer::from(result),
-            bit_offset: 0,
-            bit_len: len_in_bits,
-        }
-    }
+        // align to byte boundaries

Review Comment:
   This codepath appears untested by unit tests
   
   ```rust
   cargo llvm-cov --html test -p arrow-buffer
   ```
   
   <img width="954" height="799" alt="Image" src="https://github.com/user-attachments/assets/078144ae-5c3d-4d3e-987b-312ff8f0f5db" />
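
   A hedged sketch of a unit test that might cover it (the signature and visibility of `from_bitwise_unary_op` are assumed from the diff above, and the off-by-one slice relies on the test allocation not happening to be 8-byte aligned at `backing[1..]`):

   ```rust
   use arrow_buffer::{BooleanBuffer, Buffer};

   #[test]
   fn from_bitwise_unary_op_unaligned_input() {
       let backing = vec![0b1010_1010u8; 33];
       // Slice one byte in so the input is (very likely) not 8-byte pointer aligned,
       // which should fall through to the chunks_exact path
       let misaligned = &backing[1..];

       // Assumed signature: (src, offset_in_bits, len_in_bits, op)
       let result = BooleanBuffer::from_bitwise_unary_op(misaligned, 3, 200, |x| !x);

       let expected = BooleanBuffer::new(Buffer::from(misaligned.to_vec()), 3, 200);
       for i in 0..200 {
           assert_eq!(result.value(i), !expected.value(i), "bit {i}");
       }
   }
   ```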



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
