klion26 commented on code in PR #7935:
URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2217766357


##########
parquet-variant/src/builder.rs:
##########
@@ -598,6 +599,49 @@ impl ParentState<'_> {
             }
         }
     }
+
+    // returns the beginning offset of buffer for the parent if it is object 
builder, else 0.
+    // for object builder will reuse the buffer from the parent, this is 
needed for `finish`
+    // which needs the relative offset from the current variant.
+    fn object_start_offset(&self) -> usize {

Review Comment:
   This feature that reuse the parent buffer hasn't been implemented for 
`ListBuilder`. Will do it with a follow-up pr.



##########
parquet-variant/src/builder.rs:
##########
@@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }
 
-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = self.parent_state.metadata_builder();
 
         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as 
usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as 
usize);
+            field_a_name.cmp(field_b_name)
         });
-
         let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
-
         let id_size = int_size(max_id as usize);
-        let offset_size = int_size(data_size);
 
-        // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;
+        let offset_size = int_size(data_size);
 
-        // Write header
+        let num_fields = self.fields.len();
+        let is_large = num_fields > u8::MAX as usize;
+
+        let header_size = 1 + // header byte
+            (if is_large { 4 } else { 1 }) + // num_fields
+            (num_fields * id_size as usize) + // field IDs
+            ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+
+        let starting_offset = self.object_start_offset;
+
+        // Shift existing data to make room for the header
+        let buffer = parent_buffer.inner_mut();
+        buffer.splice(starting_offset..starting_offset, vec![0u8; 
header_size]);
+
+        // Write header at the original start position
+        let mut header_pos = starting_offset;
+
+        // Write header byte
         let header = object_header(is_large, id_size, offset_size);
-        parent_buffer.append_header(header, is_large, num_fields);
+        buffer[header_pos] = header;
+        header_pos += 1;
+
+        // Write number of fields
+        if is_large {
+            buffer[header_pos..header_pos + 4].copy_from_slice(&(num_fields as 
u32).to_le_bytes());
+            header_pos += 4;
+        } else {
+            buffer[header_pos] = num_fields as u8;
+            header_pos += 1;
+        }
 
-        // Write field IDs (sorted order)
-        let ids = self.fields.keys().map(|id| *id as usize);
-        parent_buffer.append_offset_array(ids, None, id_size);
+        // Write field IDs
+        for (&field_id, _) in &self.fields {
+            let id_bytes = (field_id as usize).to_le_bytes();
+            buffer[header_pos..header_pos + id_size as usize]
+                .copy_from_slice(&id_bytes[..id_size as usize]);
+            header_pos += id_size as usize;
+        }
 
-        // Write the field offset array, followed by the value bytes
-        let offsets = std::mem::take(&mut self.fields).into_values();
-        parent_buffer.append_offset_array(offsets, Some(data_size), 
offset_size);
-        parent_buffer.append_slice(self.buffer.inner());
-        self.parent_state.finish(starting_offset);
+        // Write field offsets (adjusted for header)
+        for (_, &relative_offset) in &self.fields {
+            let offset_bytes = relative_offset.to_le_bytes();
+            buffer[header_pos..header_pos + offset_size as usize]
+                .copy_from_slice(&offset_bytes[..offset_size as usize]);
+            header_pos += offset_size as usize;
+        }
+
+        // Write data_size
+        let data_size_bytes = data_size.to_le_bytes();
+        buffer[header_pos..header_pos + offset_size as usize]
+            .copy_from_slice(&data_size_bytes[..offset_size as usize]);
+
+        let start_offset_shift = self.parent_state.object_start_offset();
+        self.parent_state
+            .finish(starting_offset - start_offset_shift);

Review Comment:
   Yes, moving this logic to `ParentState::finish` is clearer. fixed.



##########
parquet-variant/src/builder.rs:
##########
@@ -1317,7 +1414,15 @@ impl<'a> ObjectBuilder<'a> {
 /// This is to ensure that the object is always finalized before its parent 
builder
 /// is finalized.
 impl Drop for ObjectBuilder<'_> {
-    fn drop(&mut self) {}
+    fn drop(&mut self) {
+        // truncate the buffer if the `finish` method has not been called.
+        if !self.has_been_finished {
+            self.parent_state
+                .buffer()
+                .inner_mut()
+                .truncate(self.object_start_offset);
+        }

Review Comment:
   Seems that in previous pr #7865, some logics want to cover that the metadata 
hasn't been rolled back in tests like 
`test_list_builder_to_object_builder_inner_no_finish`, do we need to roll back 
the `MetadataBuilder` here? 
   
   I update this into a seperate commit, we can keep it or revert it easily.
   
   IIUC, the `Metadata` can be rolled back because the object has not been 
written successfully, but not sure if there are any cases I did not follow here.
   
   ```
           // The parent list should only contain the original values
           list_builder.finish();
           let (metadata, value) = builder.finish();
           let metadata = VariantMetadata::try_new(&metadata).unwrap();
           assert_eq!(metadata.len(), 1);
           assert_eq!(&metadata[0], "name"); // not rolled back
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }
 
-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = self.parent_state.metadata_builder();
 
         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as 
usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as 
usize);
+            field_a_name.cmp(field_b_name)
         });
-
         let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);

Review Comment:
   Using the length of the metadata builder doesn't have a correctness problem, 
but it will have a tiny problem that will waste some space for the header. 
added in a separate commit.



##########
parquet-variant/src/builder.rs:
##########
@@ -598,6 +599,49 @@ impl ParentState<'_> {
             }
         }
     }
+
+    // returns the beginning offset of buffer for the parent if it is object 
builder, else 0.
+    // for object builder will reuse the buffer from the parent, this is 
needed for `finish`
+    // which needs the relative offset from the current variant.
+    fn object_start_offset(&self) -> usize {
+        match self {
+            ParentState::Object {
+                object_start_offset,
+                ..
+            } => *object_start_offset,
+            _ => 0,
+        }
+    }
+
+    /// Return mutable references to the buffer and metadata builder that this
+    /// parent state is using.
+    fn buffer_and_metadata_builder(&mut self) -> (&mut ValueBuffer, &mut 
MetadataBuilder) {
+        match self {
+            ParentState::Variant {
+                buffer,
+                metadata_builder,
+            } => (buffer, metadata_builder),
+            ParentState::List {
+                buffer,
+                metadata_builder,
+                ..
+            } => (buffer, metadata_builder),
+            ParentState::Object {
+                buffer,
+                metadata_builder,
+                ..
+            } => (buffer, metadata_builder),

Review Comment:
   addressed



##########
parquet-variant/src/builder.rs:
##########
@@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }
 
-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = self.parent_state.metadata_builder();
 
         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as 
usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as 
usize);
+            field_a_name.cmp(field_b_name)
         });
-
         let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
-
         let id_size = int_size(max_id as usize);
-        let offset_size = int_size(data_size);
 
-        // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;
+        let offset_size = int_size(data_size);
 
-        // Write header
+        let num_fields = self.fields.len();
+        let is_large = num_fields > u8::MAX as usize;
+
+        let header_size = 1 + // header byte
+            (if is_large { 4 } else { 1 }) + // num_fields
+            (num_fields * id_size as usize) + // field IDs
+            ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size

Review Comment:
   Do you think it's ok to make this optimization a follow-up pr and add some 
benchmark for it?



##########
parquet-variant/src/builder.rs:
##########
@@ -598,6 +599,49 @@ impl ParentState<'_> {
             }
         }
     }
+
+    // returns the beginning offset of buffer for the parent if it is object 
builder, else 0.
+    // for object builder will reuse the buffer from the parent, this is 
needed for `finish`
+    // which needs the relative offset from the current variant.
+    fn object_start_offset(&self) -> usize {
+        match self {
+            ParentState::Object {
+                object_start_offset,
+                ..
+            } => *object_start_offset,
+            _ => 0,
+        }
+    }
+
+    /// Return mutable references to the buffer and metadata builder that this
+    /// parent state is using.
+    fn buffer_and_metadata_builder(&mut self) -> (&mut ValueBuffer, &mut 
MetadataBuilder) {
+        match self {
+            ParentState::Variant {
+                buffer,
+                metadata_builder,
+            } => (buffer, metadata_builder),
+            ParentState::List {
+                buffer,
+                metadata_builder,
+                ..
+            } => (buffer, metadata_builder),
+            ParentState::Object {
+                buffer,
+                metadata_builder,
+                ..
+            } => (buffer, metadata_builder),
+        }
+    }
+
+    // return the offset of the underlying buffer at the time of calling this 
method.
+    fn buffer_current_offset(&self) -> usize {
+        match self {
+            ParentState::Variant { buffer, .. } => buffer.offset(),
+            ParentState::Object { buffer, .. } => buffer.offset(),
+            ParentState::List { buffer, .. } => buffer.offset(),
+        }

Review Comment:
   This needs the signature changed to `buffer_current_offset(&mut self)`, not 
sure if this is ok?
   
   changed to the first version



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to