scovich commented on code in PR #7987:
URL: https://github.com/apache/arrow-rs/pull/7987#discussion_r2229500164


##########
parquet-variant/src/builder.rs:
##########
@@ -1115,22 +1086,75 @@ impl VariantBuilder {
     }
 }
 
+/// An iterator that yields the bytes of a packed u32 iterator.
+/// Will yield the first `packed_bytes` bytes of each item in the iterator.
+struct PackedU32Iterator<T: Iterator<Item = [u8; 4]>> {
+    packed_bytes: usize,
+    iterator: T,
+    current_item: [u8; 4],
+    current_byte: usize, // 0..3
+}
+
+impl<T: Iterator<Item = [u8; 4]>> PackedU32Iterator<T> {
+    fn new(packed_bytes: usize, iterator: T) -> Self {
+        // eliminate corner cases in `next` by initializing with a fake 
already-consumed "first" item
+        Self {
+            packed_bytes,
+            iterator,
+            current_item: [0; 4],
+            current_byte: packed_bytes,
+        }
+    }
+}
+
+impl<T: Iterator<Item = [u8; 4]>> Iterator for PackedU32Iterator<T> {
+    type Item = u8;
+
+    fn next(&mut self) -> Option<u8> {
+        if self.current_byte >= self.packed_bytes {
+            let next_item = self.iterator.next()?;
+            self.current_item = next_item;

Review Comment:
   Split into two statements due to lifetime issues, I suppose?



##########
parquet-variant/src/builder.rs:
##########
@@ -1216,24 +1245,46 @@ impl<'a> ListBuilder<'a> {
 
     /// Finalizes this list and appends it to its parent, which otherwise 
remains unmodified.
     pub fn finish(mut self) {
-        let data_size = self.buffer.offset();
+        let buffer = self.parent_state.buffer();
+
+        let data_size = buffer.offset() - self.parent_value_offset_base;
+
         let num_elements = self.offsets.len();
         let is_large = num_elements > u8::MAX as usize;
         let offset_size = int_size(data_size);
 
-        // Get parent's buffer
-        let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let starting_offset = self.parent_value_offset_base;
 
         // Write header
         let header = array_header(is_large, offset_size);
-        parent_buffer.append_header(header, is_large, num_elements);
 
-        // Write out the offset array followed by the value bytes
-        let offsets = std::mem::take(&mut self.offsets);
-        parent_buffer.append_offset_array(offsets, Some(data_size), 
offset_size);
-        parent_buffer.append_slice(self.buffer.inner());
+        let num_elements_bytes =
+            num_elements
+                .to_le_bytes()
+                .into_iter()
+                .take(if is_large { 4 } else { 1 });
+        let offsets = PackedU32Iterator::new(
+            offset_size as usize,
+            self.offsets
+                .clone()

Review Comment:
   Just do `self.offsets.iter().map(|&offset| ...)`, relying on the fact that 
u32 is Copy -- instead of cloning the whole vec?



##########
parquet-variant/src/builder.rs:
##########
@@ -1216,24 +1245,46 @@ impl<'a> ListBuilder<'a> {
 
     /// Finalizes this list and appends it to its parent, which otherwise 
remains unmodified.
     pub fn finish(mut self) {
-        let data_size = self.buffer.offset();
+        let buffer = self.parent_state.buffer();
+
+        let data_size = buffer.offset() - self.parent_value_offset_base;
+
         let num_elements = self.offsets.len();
         let is_large = num_elements > u8::MAX as usize;
         let offset_size = int_size(data_size);
 
-        // Get parent's buffer
-        let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let starting_offset = self.parent_value_offset_base;
 
         // Write header
         let header = array_header(is_large, offset_size);
-        parent_buffer.append_header(header, is_large, num_elements);
 
-        // Write out the offset array followed by the value bytes
-        let offsets = std::mem::take(&mut self.offsets);
-        parent_buffer.append_offset_array(offsets, Some(data_size), 
offset_size);
-        parent_buffer.append_slice(self.buffer.inner());
+        let num_elements_bytes =
+            num_elements
+                .to_le_bytes()
+                .into_iter()
+                .take(if is_large { 4 } else { 1 });
+        let offsets = PackedU32Iterator::new(
+            offset_size as usize,
+            self.offsets
+                .clone()

Review Comment:
   Or,
   ```rust
   let offsets = std::mem::take(&mut self.offsets).into_iter();
   let offsets = offsets.map(|offset| (offset as u32).to_le_bytes());
   let offsets = PackedU32Iterator::new(offset_size as usize, offsets);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to