scovich commented on code in PR #7935: URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2220120562
########## parquet-variant/src/builder.rs: ########## @@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> { ))); } - let data_size = self.buffer.offset(); - let num_fields = self.fields.len(); - let is_large = num_fields > u8::MAX as usize; + let metadata_builder = self.parent_state.metadata_builder(); self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &metadata_builder.field_name(field_a_id as usize); - let key_b = &metadata_builder.field_name(field_b_id as usize); - key_a.cmp(key_b) + let field_a_name = metadata_builder.field_name(field_a_id as usize); + let field_b_name = metadata_builder.field_name(field_b_id as usize); + field_a_name.cmp(field_b_name) }); - let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0); Review Comment: Why would it waste space? With high probability, the largest field id in the object will take the same number of bytes to encode as `self.fields.len() - 1` (the highest field id in the dictionary), so using the latter shouldn't change anything? ########## parquet-variant/src/builder.rs: ########## @@ -1176,7 +1335,20 @@ impl<'a> ObjectBuilder<'a> { /// This is to ensure that the object is always finalized before its parent builder /// is finalized. impl Drop for ObjectBuilder<'_> { - fn drop(&mut self) {} + fn drop(&mut self) { + // Truncate the buffer if the `finish` method has not been called. + if !self.has_been_finished { + self.parent_state + .buffer() + .inner_mut() + .truncate(self.parent_offset_base); Review Comment: now that we have two offset bases, should we rename this one as `parent_value_offset_base` for clarity? ########## parquet-variant/src/builder.rs: ########## @@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> { ))); } - let data_size = self.buffer.offset(); - let num_fields = self.fields.len(); - let is_large = num_fields > u8::MAX as usize; + let metadata_builder = self.parent_state.metadata_builder(); self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &metadata_builder.field_name(field_a_id as usize); - let key_b = &metadata_builder.field_name(field_b_id as usize); - key_a.cmp(key_b) + let field_a_name = metadata_builder.field_name(field_a_id as usize); + let field_b_name = metadata_builder.field_name(field_b_id as usize); + field_a_name.cmp(field_b_name) }); - let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0); - let id_size = int_size(max_id as usize); - let offset_size = int_size(data_size); - // Get parent's buffer let parent_buffer = self.parent_state.buffer(); - let starting_offset = parent_buffer.offset(); + let current_offset = parent_buffer.offset(); + // current object starts from `object_start_offset` + let data_size = current_offset - self.object_start_offset; + let offset_size = int_size(data_size); - // Write header + let num_fields = self.fields.len(); + let is_large = num_fields > u8::MAX as usize; + + let header_size = 1 + // header byte + (if is_large { 4 } else { 1 }) + // num_fields + (num_fields * id_size as usize) + // field IDs + ((num_fields + 1) * offset_size as usize); // field offsets + data_size Review Comment: Or try the other approach that populates the tmp vec with non-zero bytes before splicing it into the main buffer? That's a lot simpler than this iterator-based suggestion, and likely performs better too. ########## parquet-variant/src/builder.rs: ########## @@ -2253,8 +2620,7 @@ mod tests { // The original builder should be unchanged let (metadata, value) = builder.finish(); let metadata = VariantMetadata::try_new(&metadata).unwrap(); - assert_eq!(metadata.len(), 1); - assert_eq!(&metadata[0], "name"); // not rolled back + assert!(metadata.is_empty()); // rolled back Review Comment: yay! 🎉 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org