alamb commented on code in PR #9137:
URL: https://github.com/apache/arrow-rs/pull/9137#discussion_r2679461906


##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };

Review Comment:
   I almost wonder if it would make sense to put this call to `self.reserve` in 
`MutableBuffer::set_len` 



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;
+    }
+
+    /// Extends this builder with boolean values.
+    ///
+    /// This requires `iter` to report an exact size via `size_hint`.
+    ///
+    /// # Safety
+    /// Callers must ensure that `iter` reports an exact size via `size_hint`.
+    #[inline]
+    pub unsafe fn extend_bool_trusted_len<I: Iterator<Item = bool>>(
+        &mut self,
+        iter: I,
+        offset: usize,
+    ) {
+        let (lower, upper) = iter.size_hint();
+        let len = upper.expect("Iterator must have exact size_hint");
+        assert_eq!(lower, len, "Iterator must have exact size_hint");
+
+        if len == 0 {
+            return;
+        }
+
+        let start_len = offset;
+        let end_bit = start_len + len;
+
+        // SAFETY: we will initialize all newly exposed bytes before they are 
read
+        unsafe { self.advance_uninit(len) };
+        let slice = self.as_slice_mut();
+
+        let mut iter = iter;

Review Comment:
   nit: you could avoid this by just declaring the parameter `mut`, I think
   
   ```diff
   --- a/arrow-buffer/src/buffer/mutable.rs
   +++ b/arrow-buffer/src/buffer/mutable.rs
   @@ -648,7 +648,7 @@ impl MutableBuffer {
        #[inline]
        pub unsafe fn extend_bool_trusted_len<I: Iterator<Item = bool>>(
            &mut self,
   -        iter: I,
   +        mut iter: I,
            offset: usize,
        ) {
            let (lower, upper) = iter.size_hint();
   ```



##########
arrow-buffer/src/builder/boolean.rs:
##########
@@ -259,6 +259,20 @@ impl BooleanBufferBuilder {
     pub fn finish_cloned(&self) -> BooleanBuffer {
         BooleanBuffer::new(Buffer::from_slice_ref(self.as_slice()), 0, 
self.len)
     }
+
+    /// Extends the builder from a trusted length iterator of booleans.
+    /// # Safety
+    /// Callers must ensure that `iter` reports an exact size via `size_hint`.
+    ///
+    #[inline]
+    pub unsafe fn extend_trusted_len<I>(&mut self, iterator: I)
+    where
+        I: Iterator<Item = bool>,
+    {
+        let len = iterator.size_hint().0;
+        unsafe { self.buffer.extend_bool_trusted_len(iterator, self.len) };

Review Comment:
   It might make it easier to understand the correctness if 
`extend_bool_trusted_len` returned the number of bits that were appended rather 
than having to get it from the iterator



##########
arrow-buffer/src/builder/boolean.rs:
##########
@@ -526,4 +540,65 @@ mod tests {
         assert_eq!(buf.len(), buf2.inner().len());
         assert_eq!(buf.as_slice(), buf2.values());
     }
+
+    #[test]
+    fn test_extend() {

Review Comment:
   I recommend adding some explicit tests for the mutable buffer API directly 
as well.



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;
+    }
+
+    /// Extends this builder with boolean values.
+    ///
+    /// This requires `iter` to report an exact size via `size_hint`.
+    ///
+    /// # Safety
+    /// Callers must ensure that `iter` reports an exact size via `size_hint`.
+    #[inline]
+    pub unsafe fn extend_bool_trusted_len<I: Iterator<Item = bool>>(
+        &mut self,
+        iter: I,
+        offset: usize,
+    ) {
+        let (lower, upper) = iter.size_hint();
+        let len = upper.expect("Iterator must have exact size_hint");
+        assert_eq!(lower, len, "Iterator must have exact size_hint");
+
+        if len == 0 {
+            return;
+        }
+
+        let start_len = offset;
+        let end_bit = start_len + len;
+
+        // SAFETY: we will initialize all newly exposed bytes before they are 
read
+        unsafe { self.advance_uninit(len) };
+        let slice = self.as_slice_mut();
+
+        let mut iter = iter;
+        let mut bit_idx = start_len;
+
+        // ---- Unaligned prefix: advance to the next 64-bit boundary ----
+        let misalignment = bit_idx & 63;
+        let prefix_bits = if misalignment == 0 {
+            0
+        } else {
+            (64 - misalignment).min(end_bit - bit_idx)
+        };
+
+        if prefix_bits != 0 {
+            let byte_start = bit_idx / 8;
+            let byte_end = bit_util::ceil(bit_idx + prefix_bits, 8);
+            let bit_offset = bit_idx % 8;
+
+            // Clear any newly-visible bits in the existing partial byte
+            if bit_offset != 0 {
+                let keep_mask = (1u8 << bit_offset).wrapping_sub(1);
+                slice[byte_start] &= keep_mask;
+            }
+
+            // Zero any new bytes we will partially fill in this prefix
+            let zero_from = if bit_offset == 0 {
+                byte_start
+            } else {
+                byte_start + 1
+            };
+            if byte_end > zero_from {
+                slice[zero_from..byte_end].fill(0);
+            }
+
+            for _ in 0..prefix_bits {
+                let v = iter.next().unwrap();
+                if v {
+                    let byte_idx = bit_idx / 8;
+                    let bit = bit_idx % 8;
+                    slice[byte_idx] |= 1 << bit;
+                }
+                bit_idx += 1;
+            }
+        }
+
+        if bit_idx < end_bit {
+            // ---- Aligned middle: write u64 chunks ----
+            debug_assert_eq!(bit_idx & 63, 0);
+            let remaining_bits = end_bit - bit_idx;
+            let chunks = remaining_bits / 64;
+
+            let words_start = bit_idx / 8;
+            let words_end = words_start + chunks * 8;
+            for dst in slice[words_start..words_end].chunks_exact_mut(8) {

Review Comment:
   Once we have MSRV 1.88 we could also use as_chunks_mut and the compiler will 
know exactly how long the slice is:
   
   ```rust
               let (chunks, _remain) = 
slice[words_start..words_end].as_chunks_mut::<8>();
               for dst in chunks {
   ```
   
   However, it isn't quite ready
   ```
   error: current MSRV (Minimum Supported Rust Version) is `1.85.0` but this 
item is stable since `1.88.0`
      --> arrow-buffer/src/buffer/mutable.rs:720:67
       |
   720 |             let (chunks, _remain) = 
slice[words_start..words_end].as_chunks_mut::<8>();
       |                                                                   
^^^^^^^^^^^^^^^^^^^^
   ```



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;
+    }
+
+    /// Extends this builder with boolean values.

Review Comment:
   I think we should also document that all bits are set to zero outside the 
bit range 
   
   (also the text says builder when this is a *buffer*)



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;
+    }
+
+    /// Extends this builder with boolean values.
+    ///
+    /// This requires `iter` to report an exact size via `size_hint`.

Review Comment:
   We need to document that `offset` is an offset in bits and must be less than 
the current length. It might also be good to put an assert in that 
`div_ceil(offset, 8)` is less than the length



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;

Review Comment:
   I think this is a bug (found by chatgpt codex) -- `additional` is measured in 
`bits` but `self.len` is measured in bytes.  Here is a proposal to fix:
   - https://github.com/Dandandan/arrow-rs/pull/9
   



##########
arrow-buffer/src/buffer/mutable.rs:
##########
@@ -623,6 +623,139 @@ impl MutableBuffer {
         buffer
     }
 
+    /// Advances the buffer by `additional` bits without initializing the new 
bytes.
+    ///
+    /// # Safety
+    /// Callers must ensure that all newly added bits are written before the 
buffer is read.
+    #[inline]
+    unsafe fn advance_uninit(&mut self, additional: usize) {
+        let new_len = self.len + additional;
+        let new_len_bytes = bit_util::ceil(new_len, 8);
+        if new_len_bytes > self.len() {
+            self.reserve(new_len_bytes - self.len());
+            // SAFETY: caller will initialize all newly exposed bytes
+            unsafe { self.set_len(new_len_bytes) };
+        }
+        self.len = new_len;
+    }
+
+    /// Extends this builder with boolean values.
+    ///
+    /// This requires `iter` to report an exact size via `size_hint`.
+    ///
+    /// # Safety
+    /// Callers must ensure that `iter` reports an exact size via `size_hint`.
+    #[inline]
+    pub unsafe fn extend_bool_trusted_len<I: Iterator<Item = bool>>(
+        &mut self,
+        iter: I,
+        offset: usize,
+    ) {
+        let (lower, upper) = iter.size_hint();
+        let len = upper.expect("Iterator must have exact size_hint");
+        assert_eq!(lower, len, "Iterator must have exact size_hint");
+
+        if len == 0 {

Review Comment:
   Technically speaking there is no test coverage for this branch
   
   <img width="740" height="317" alt="Image" 
src="https://github.com/user-attachments/assets/de589e4c-ffff-41ef-b1c8-53ada187ba63"
 />



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to