klion26 commented on code in PR #7935: URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2217766357
########## parquet-variant/src/builder.rs: ########## @@ -598,6 +599,49 @@ impl ParentState<'_> { } } } + + // returns the beginning offset of buffer for the parent if it is object builder, else 0. + // for object builder will reuse the buffer from the parent, this is needed for `finish` + // which needs the relative offset from the current variant. + fn object_start_offset(&self) -> usize { Review Comment: This feature that reuse the parent buffer hasn't been implemented for `ListBuilder`. Will do it with a follow-up pr. ########## parquet-variant/src/builder.rs: ########## @@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> { ))); } - let data_size = self.buffer.offset(); - let num_fields = self.fields.len(); - let is_large = num_fields > u8::MAX as usize; + let metadata_builder = self.parent_state.metadata_builder(); self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &metadata_builder.field_name(field_a_id as usize); - let key_b = &metadata_builder.field_name(field_b_id as usize); - key_a.cmp(key_b) + let field_a_name = metadata_builder.field_name(field_a_id as usize); + let field_b_name = metadata_builder.field_name(field_b_id as usize); + field_a_name.cmp(field_b_name) }); - let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0); - let id_size = int_size(max_id as usize); - let offset_size = int_size(data_size); - // Get parent's buffer let parent_buffer = self.parent_state.buffer(); - let starting_offset = parent_buffer.offset(); + let current_offset = parent_buffer.offset(); + // current object starts from `object_start_offset` + let data_size = current_offset - self.object_start_offset; + let offset_size = int_size(data_size); - // Write header + let num_fields = self.fields.len(); + let is_large = num_fields > u8::MAX as usize; + + let header_size = 1 + // header byte + (if is_large { 4 } else { 1 }) + // num_fields + (num_fields * id_size as usize) + // field IDs + ((num_fields + 1) * offset_size as usize); // field offsets + data_size + + let starting_offset = self.object_start_offset; + + // Shift existing data to make room for the header + let buffer = parent_buffer.inner_mut(); + buffer.splice(starting_offset..starting_offset, vec![0u8; header_size]); + + // Write header at the original start position + let mut header_pos = starting_offset; + + // Write header byte let header = object_header(is_large, id_size, offset_size); - parent_buffer.append_header(header, is_large, num_fields); + buffer[header_pos] = header; + header_pos += 1; + + // Write number of fields + if is_large { + buffer[header_pos..header_pos + 4].copy_from_slice(&(num_fields as u32).to_le_bytes()); + header_pos += 4; + } else { + buffer[header_pos] = num_fields as u8; + header_pos += 1; + } - // Write field IDs (sorted order) - let ids = self.fields.keys().map(|id| *id as usize); - parent_buffer.append_offset_array(ids, None, id_size); + // Write field IDs + for (&field_id, _) in &self.fields { + let id_bytes = (field_id as usize).to_le_bytes(); + buffer[header_pos..header_pos + id_size as usize] + .copy_from_slice(&id_bytes[..id_size as usize]); + header_pos += id_size as usize; + } - // Write the field offset array, followed by the value bytes - let offsets = std::mem::take(&mut self.fields).into_values(); - parent_buffer.append_offset_array(offsets, Some(data_size), offset_size); - parent_buffer.append_slice(self.buffer.inner()); - self.parent_state.finish(starting_offset); + // Write field offsets (adjusted for header) + for (_, &relative_offset) in &self.fields { + let offset_bytes = relative_offset.to_le_bytes(); + buffer[header_pos..header_pos + offset_size as usize] + .copy_from_slice(&offset_bytes[..offset_size as usize]); + header_pos += offset_size as usize; + } + + // Write data_size + let data_size_bytes = data_size.to_le_bytes(); + buffer[header_pos..header_pos + offset_size as usize] + .copy_from_slice(&data_size_bytes[..offset_size as usize]); + + let start_offset_shift = self.parent_state.object_start_offset(); + self.parent_state + .finish(starting_offset - start_offset_shift); Review Comment: Yes, moving this logic to `ParentState::finish` is clearer. fixed. ########## parquet-variant/src/builder.rs: ########## @@ -1317,7 +1414,15 @@ impl<'a> ObjectBuilder<'a> { /// This is to ensure that the object is always finalized before its parent builder /// is finalized. impl Drop for ObjectBuilder<'_> { - fn drop(&mut self) {} + fn drop(&mut self) { + // truncate the buffer if the `finish` method has not been called. + if !self.has_been_finished { + self.parent_state + .buffer() + .inner_mut() + .truncate(self.object_start_offset); + } Review Comment: Seems that in previous pr #7865, some logics want to cover that the metadata hasn't been rolled back in tests like `test_list_builder_to_object_builder_inner_no_finish`, do we need to roll back the `MetadataBuilder` here? I update this into a seperate commit, we can keep it or revert it easily. IIUC, the `Metadata` can be rolled back because the object has not been written successfully, but not sure if there are any cases I did not follow here. ``` // The parent list should only contain the original values list_builder.finish(); let (metadata, value) = builder.finish(); let metadata = VariantMetadata::try_new(&metadata).unwrap(); assert_eq!(metadata.len(), 1); assert_eq!(&metadata[0], "name"); // not rolled back ``` ########## parquet-variant/src/builder.rs: ########## @@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> { ))); } - let data_size = self.buffer.offset(); - let num_fields = self.fields.len(); - let is_large = num_fields > u8::MAX as usize; + let metadata_builder = self.parent_state.metadata_builder(); self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &metadata_builder.field_name(field_a_id as usize); - let key_b = &metadata_builder.field_name(field_b_id as usize); - key_a.cmp(key_b) + let field_a_name = metadata_builder.field_name(field_a_id as usize); + let field_b_name = metadata_builder.field_name(field_b_id as usize); + field_a_name.cmp(field_b_name) }); - let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0); Review Comment: Using the length of the metadata builder doesn't have a correctness problem, but it will have a tiny problem that will waste some space for the header. added in a separate commit. ########## parquet-variant/src/builder.rs: ########## @@ -598,6 +599,49 @@ impl ParentState<'_> { } } } + + // returns the beginning offset of buffer for the parent if it is object builder, else 0. + // for object builder will reuse the buffer from the parent, this is needed for `finish` + // which needs the relative offset from the current variant. + fn object_start_offset(&self) -> usize { + match self { + ParentState::Object { + object_start_offset, + .. + } => *object_start_offset, + _ => 0, + } + } + + /// Return mutable references to the buffer and metadata builder that this + /// parent state is using. + fn buffer_and_metadata_builder(&mut self) -> (&mut ValueBuffer, &mut MetadataBuilder) { + match self { + ParentState::Variant { + buffer, + metadata_builder, + } => (buffer, metadata_builder), + ParentState::List { + buffer, + metadata_builder, + .. + } => (buffer, metadata_builder), + ParentState::Object { + buffer, + metadata_builder, + .. + } => (buffer, metadata_builder), Review Comment: addressed ########## parquet-variant/src/builder.rs: ########## @@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> { ))); } - let data_size = self.buffer.offset(); - let num_fields = self.fields.len(); - let is_large = num_fields > u8::MAX as usize; + let metadata_builder = self.parent_state.metadata_builder(); self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &metadata_builder.field_name(field_a_id as usize); - let key_b = &metadata_builder.field_name(field_b_id as usize); - key_a.cmp(key_b) + let field_a_name = metadata_builder.field_name(field_a_id as usize); + let field_b_name = metadata_builder.field_name(field_b_id as usize); + field_a_name.cmp(field_b_name) }); - let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0); - let id_size = int_size(max_id as usize); - let offset_size = int_size(data_size); - // Get parent's buffer let parent_buffer = self.parent_state.buffer(); - let starting_offset = parent_buffer.offset(); + let current_offset = parent_buffer.offset(); + // current object starts from `object_start_offset` + let data_size = current_offset - self.object_start_offset; + let offset_size = int_size(data_size); - // Write header + let num_fields = self.fields.len(); + let is_large = num_fields > u8::MAX as usize; + + let header_size = 1 + // header byte + (if is_large { 4 } else { 1 }) + // num_fields + (num_fields * id_size as usize) + // field IDs + ((num_fields + 1) * offset_size as usize); // field offsets + data_size Review Comment: Do you think it's ok to make this optimization a follow-up pr and add some benchmark for it? ########## parquet-variant/src/builder.rs: ########## @@ -598,6 +599,49 @@ impl ParentState<'_> { } } } + + // returns the beginning offset of buffer for the parent if it is object builder, else 0. + // for object builder will reuse the buffer from the parent, this is needed for `finish` + // which needs the relative offset from the current variant. + fn object_start_offset(&self) -> usize { + match self { + ParentState::Object { + object_start_offset, + .. + } => *object_start_offset, + _ => 0, + } + } + + /// Return mutable references to the buffer and metadata builder that this + /// parent state is using. + fn buffer_and_metadata_builder(&mut self) -> (&mut ValueBuffer, &mut MetadataBuilder) { + match self { + ParentState::Variant { + buffer, + metadata_builder, + } => (buffer, metadata_builder), + ParentState::List { + buffer, + metadata_builder, + .. + } => (buffer, metadata_builder), + ParentState::Object { + buffer, + metadata_builder, + .. + } => (buffer, metadata_builder), + } + } + + // return the offset of the underlying buffer at the time of calling this method. + fn buffer_current_offset(&self) -> usize { + match self { + ParentState::Variant { buffer, .. } => buffer.offset(), + ParentState::Object { buffer, .. } => buffer.offset(), + ParentState::List { buffer, .. } => buffer.offset(), + } Review Comment: This needs the signature changed to `buffer_current_offset(&mut self)`, not sure if this is ok? changed to the first version -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org