scovich commented on code in PR #7987: URL: https://github.com/apache/arrow-rs/pull/7987#discussion_r2229500164
########## parquet-variant/src/builder.rs: ########## @@ -1115,22 +1086,75 @@ impl VariantBuilder { } } +/// An iterator that yields the bytes of a packed u32 iterator. +/// Will yield the first `packed_bytes` bytes of each item in the iterator. +struct PackedU32Iterator<T: Iterator<Item = [u8; 4]>> { + packed_bytes: usize, + iterator: T, + current_item: [u8; 4], + current_byte: usize, // 0..3 +} + +impl<T: Iterator<Item = [u8; 4]>> PackedU32Iterator<T> { + fn new(packed_bytes: usize, iterator: T) -> Self { + // eliminate corner cases in `next` by initializing with a fake already-consumed "first" item + Self { + packed_bytes, + iterator, + current_item: [0; 4], + current_byte: packed_bytes, + } + } +} + +impl<T: Iterator<Item = [u8; 4]>> Iterator for PackedU32Iterator<T> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + if self.current_byte >= self.packed_bytes { + let next_item = self.iterator.next()?; + self.current_item = next_item; Review Comment: Split into two statements due to lifetime issues, I suppose? ########## parquet-variant/src/builder.rs: ########## @@ -1216,24 +1245,46 @@ impl<'a> ListBuilder<'a> { /// Finalizes this list and appends it to its parent, which otherwise remains unmodified. 
pub fn finish(mut self) { - let data_size = self.buffer.offset(); + let buffer = self.parent_state.buffer(); + + let data_size = buffer.offset() - self.parent_value_offset_base; + let num_elements = self.offsets.len(); let is_large = num_elements > u8::MAX as usize; let offset_size = int_size(data_size); - // Get parent's buffer - let parent_buffer = self.parent_state.buffer(); - let starting_offset = parent_buffer.offset(); + let starting_offset = self.parent_value_offset_base; // Write header let header = array_header(is_large, offset_size); - parent_buffer.append_header(header, is_large, num_elements); - // Write out the offset array followed by the value bytes - let offsets = std::mem::take(&mut self.offsets); - parent_buffer.append_offset_array(offsets, Some(data_size), offset_size); - parent_buffer.append_slice(self.buffer.inner()); + let num_elements_bytes = + num_elements + .to_le_bytes() + .into_iter() + .take(if is_large { 4 } else { 1 }); + let offsets = PackedU32Iterator::new( + offset_size as usize, + self.offsets + .clone() Review Comment: Just do `self.offsets.iter().map(|&offset| ...)`, relying on the fact that u32 is Copy -- instead of cloning the whole vec? ########## parquet-variant/src/builder.rs: ########## @@ -1216,24 +1245,46 @@ impl<'a> ListBuilder<'a> { /// Finalizes this list and appends it to its parent, which otherwise remains unmodified. 
pub fn finish(mut self) { - let data_size = self.buffer.offset(); + let buffer = self.parent_state.buffer(); + + let data_size = buffer.offset() - self.parent_value_offset_base; + let num_elements = self.offsets.len(); let is_large = num_elements > u8::MAX as usize; let offset_size = int_size(data_size); - // Get parent's buffer - let parent_buffer = self.parent_state.buffer(); - let starting_offset = parent_buffer.offset(); + let starting_offset = self.parent_value_offset_base; // Write header let header = array_header(is_large, offset_size); - parent_buffer.append_header(header, is_large, num_elements); - // Write out the offset array followed by the value bytes - let offsets = std::mem::take(&mut self.offsets); - parent_buffer.append_offset_array(offsets, Some(data_size), offset_size); - parent_buffer.append_slice(self.buffer.inner()); + let num_elements_bytes = + num_elements + .to_le_bytes() + .into_iter() + .take(if is_large { 4 } else { 1 }); + let offsets = PackedU32Iterator::new( + offset_size as usize, + self.offsets + .clone() Review Comment: Or, ```rust let offsets = std::mem::take(&mut self.offsets).into_iter(); let offsets = offsets.map(|offset| (offset as u32).to_le_bytes()); let offsets = PackedU32Iterator::new(offset_size as usize, offsets); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org