scovich commented on code in PR #7935:
URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2217431963


##########
parquet-variant/src/builder.rs:
##########
@@ -1275,38 +1330,80 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }
 
-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = self.parent_state.metadata_builder();
 
         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as usize);
+            field_a_name.cmp(field_b_name)
         });
-
         let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
-
         let id_size = int_size(max_id as usize);
-        let offset_size = int_size(data_size);
 
-        // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;
+        let offset_size = int_size(data_size);
 
-        // Write header
+        let num_fields = self.fields.len();
+        let is_large = num_fields > u8::MAX as usize;
+
+        let header_size = 1 + // header byte
+            (if is_large { 4 } else { 1 }) + // num_fields
+            (num_fields * id_size as usize) + // field IDs
+            ((num_fields + 1) * offset_size as usize); // field offsets + data_size
+
+        let starting_offset = self.object_start_offset;
+
+        // Shift existing data to make room for the header
+        let buffer = parent_buffer.inner_mut();
+        buffer.splice(starting_offset..starting_offset, vec![0u8; header_size]);

Review Comment:
   FYI, I think we can improve the performance of appending packed `u32` values to a `Vec<u8>`. The following code:
   ```rust
   pub fn append_packed_u32(dest: &mut Vec<u8>, src: u32, nbytes: usize) {
       let n = dest.len() + nbytes;
       dest.extend(src.to_le_bytes());
       dest.truncate(n);
   }
   ```
   ... unconditionally adds all four bytes and then truncates the vector to the desired length. This works because (a) copying the four LE bytes of a `u32` to a memory location is a single machine instruction; (b) with LE byte order, `truncate` drops exactly the higher-order bytes; and (c) `truncate` is dirt cheap (a conditional move that doesn't break the processor pipeline the way a branch would).
   
   Result: code that should perform the same regardless of how many bytes we encode the `u32` as, with only a single branch to check the vector's capacity.
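   
   As a quick sanity check (a hypothetical usage example, not from the PR), the truncate keeps exactly the low-order bytes:
   ```rust
   fn main() {
       let mut dest = vec![0xffu8];
       // Append only the two low-order LE bytes of the u32.
       append_packed_u32(&mut dest, 0x0403_0201, 2);
       assert_eq!(dest, [0xff, 0x01, 0x02]);
   }
   ```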
   
   <details>
   
   The above [compiles](https://play.rust-lang.org/?version=stable&mode=release&edition=2024&gist=c4e3f95d39ad58148435d55ff7095a3a) to the following assembly code:
   ```asm
   playground::append_packed_u32:
        ;; set up stack frame (not relevant in practice due to inlining)
          ...
        movq    (%rdi), %rcx      ; dest capacity
        movq    16(%rdi), %rbx    ; dest len
        subq    %rbx, %rcx        ; calculate available capacity
        movq    %rbx, %rax
        cmpq    $3, %rcx          ; if insufficient capacity...
        jbe     .LBB2_1           ; ... then `reserve` more
   
   .LBB2_2:
        addq    %rdx, %rbx        ; add nbytes to len (truncate arg)
        movq    8(%rdi), %rcx     ; dest data pointer
        movl    %esi, (%rcx,%rax) ; append all four bytes of src to the vector
        addq    $4, %rax          ; increase len by 4 (u32 size)
        cmpq    %rax, %rbx        ; if new len is bigger than truncate arg...
        cmovbq  %rbx, %rax        ; ... then truncate len
        movq    %rax, 16(%rdi)    ; write back the updated dest len
        ;; tear down stack frame
          ...
        retq
   
   .LBB2_1:
        ;; call prologue
          ... 
        callq   alloc::raw_vec::RawVecInner<A>::reserve::do_reserve_and_handle
        ;; call epilogue
          ...
        jmp     .LBB2_2
   ```
   
   The one bummer is that when adding a `reserve` followed by a loop over a slice of source values:
   ```rust
   pub fn append_packed_u32(dest: &mut Vec<u8>, src: &[u32], nbytes: usize) {
       // Packed bytes for every value, plus slack so the last unconditional
       // 4-byte write stays within capacity even when `nbytes < 4`.
       dest.reserve(src.len() * nbytes + 4);
       for val in src {
           let n = dest.len() + nbytes;
           dest.extend(val.to_le_bytes());
           dest.truncate(n);
       }
   }
   ```
   ... the [compiler](https://play.rust-lang.org/?version=stable&mode=release&edition=2024&gist=e0502909fe267e592ba37ede6e048281) isn't smart enough to eliminate the redundant capacity check inside the loop. Oh, well.
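   
   For what it's worth, one way to sidestep the per-iteration capacity check is to `resize` up front and write through subslices. This is just a sketch (`append_packed_u32_resized` is my name, not the PR's; it assumes `nbytes <= 4`, and whether LLVM elides the slice bounds checks would need verifying):
   ```rust
   pub fn append_packed_u32_resized(dest: &mut Vec<u8>, src: &[u32], nbytes: usize) {
       let start = dest.len();
       // Single grow for the whole batch; no capacity checks in the loop.
       dest.resize(start + src.len() * nbytes, 0);
       for (i, val) in src.iter().enumerate() {
           let out = start + i * nbytes;
           // Keep only the `nbytes` low-order LE bytes of each value.
           dest[out..out + nbytes].copy_from_slice(&val.to_le_bytes()[..nbytes]);
       }
   }
   ```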
   
   </details>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
