viirya commented on code in PR #7935:
URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2217915311


##########
parquet-variant/src/builder.rs:
##########
@@ -1118,8 +1181,8 @@ impl<'a> ObjectBuilder<'a> {
 
     /// Finalizes this object and appends it to its parent, which otherwise 
remains unmodified.
     pub fn finish(mut self) -> Result<(), ArrowError> {
-        let metadata_builder = self.parent_state.metadata_builder();
         if self.validate_unique_fields && !self.duplicate_fields.is_empty() {
+            let metadata_builder = self.parent_state.metadata_builder();

Review Comment:
   Hm? Why move `metadata_builder` inside?



##########
parquet-variant/src/builder.rs:
##########
@@ -1028,18 +1078,29 @@ impl Drop for ListBuilder<'_> {
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    buffer: ValueBuffer,
+    /// the starting offset in the parent's buffer where this object starts
+    object_start_offset: usize,

Review Comment:
   ```suggestion
       parent_offset_base: usize,
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -1000,6 +1048,8 @@ impl<'a> ListBuilder<'a> {
 
         // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
+        // as object builder has been reused the parent buffer,
+        // we need to shift the offset by the starting offset of the parent 
object

Review Comment:
   I don't get it why put the comment here, seems `ListBuilder.finish` hasn't 
been updated.



##########
parquet-variant/src/builder.rs:
##########
@@ -506,6 +506,7 @@ enum ParentState<'a> {
         metadata_builder: &'a mut MetadataBuilder,
         fields: &'a mut IndexMap<u32, usize>,
         field_name: &'a str,
+        object_start_offset: usize,

Review Comment:
   ```suggestion
           parent_offset_base: usize,
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -1028,18 +1078,29 @@ impl Drop for ListBuilder<'_> {
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    buffer: ValueBuffer,
+    /// the starting offset in the parent's buffer where this object starts

Review Comment:
   Style suggestion, it'd be better just following existing doc style.
   
   ```suggestion
       /// The starting offset in the parent's buffer where this object starts
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -1028,18 +1078,29 @@ impl Drop for ListBuilder<'_> {
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    buffer: ValueBuffer,
+    /// the starting offset in the parent's buffer where this object starts
+    object_start_offset: usize,
+    /// the starting offset in the parent's metadata buffer where this object 
starts
+    /// used to truncate the written fields in `drop` if the current object 
has not been finished
+    object_meta_start_offset: usize,

Review Comment:
   ```suggestion
       parent_metadata_offset_base: usize,
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -1134,39 +1197,86 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }
 
-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = self.parent_state.metadata_builder();
 
         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as 
usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as 
usize);
+            field_a_name.cmp(field_b_name)
         });
 
-        let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
+        // the length of the metadata's field names is a very cheap to compute 
the upper bound.
+        // it will almost always be a tight upper bound as well -- it would 
take a pretty
+        // carefully  crafted object to use only the early field ids of a 
large dictionary.
+        let max_id = metadata_builder.field_names.len();
+        let id_size = int_size(max_id);
 
-        let id_size = int_size(max_id as usize);
+        let parent_buffer = self.parent_state.buffer();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;

Review Comment:
   Hmm, so we assume that no other object was appended at the same time? If we 
create two object builders and they insert into the parent buffer in an 
interleaved manner? Won't be the base offset (object start offset) same for 
them?



##########
parquet-variant/src/builder.rs:
##########
@@ -1028,18 +1078,29 @@ impl Drop for ListBuilder<'_> {
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    buffer: ValueBuffer,
+    /// the starting offset in the parent's buffer where this object starts

Review Comment:
   Similar style issues on other comment/doc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to