rluvaton opened a new pull request, #8388:
URL: https://github.com/apache/arrow-rs/pull/8388

   # Which issue does this PR close?
   
   N/A
   
   # Rationale for this change
   
   Making concatenation of byte arrays faster.
   
   # What changes are included in this PR?
   
   Explained below
   
   # Are these changes tested?
   
   Existing tests
   
   # Are there any user-facing changes?
   
   Nope
   
   ------------
   
   Changing from:
   ```rust
   let mut intermediate = Vec::<T::Offset>::with_capacity(offsets.len() - 1);
   for &offset in &offsets[1..] {
       intermediate.push(offset + shift);
   }
   ```
   
   to:
   ```rust
   let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];
   for (index, &offset) in offsets[1..].iter().enumerate() {
       intermediate[index] = offset + shift;
   }
   ```
   
   improves the performance of concatenating byte arrays by roughly 8% to 50% on my local machine (note: the Criterion output below was captured by benchmarking the old code against the new code as the saved baseline, so each reported "regression" is actually the speedup delivered by this change):
   
   <details>
   <summary>Benchmark results for concat</summary>
   
   
   ```text
   concat str 1024         time:   [7.2598 µs 7.2772 µs 7.2957 µs]
                           change: [+12.545% +13.070% +13.571%] (p = 0.00 < 0.05)
                           Performance has regressed.
   Found 6 outliers among 100 measurements (6.00%)
     4 (4.00%) high mild
     2 (2.00%) high severe
   
   concat str nulls 1024   time:   [4.6791 µs 4.6895 µs 4.7010 µs]
                           change: [+23.206% +23.792% +24.425%] (p = 0.00 < 0.05)
                           Performance has regressed.
   Found 13 outliers among 100 measurements (13.00%)
     5 (5.00%) high mild
     8 (8.00%) high severe
   
   concat 1024 arrays str 4
                           time:   [45.018 µs 45.213 µs 45.442 µs]
                           change: [+6.4195% +8.7377% +11.279%] (p = 0.00 < 0.05)
                           Performance has regressed.
   Found 13 outliers among 100 measurements (13.00%)
     6 (6.00%) high mild
     7 (7.00%) high severe
   
   concat str 8192 over 100 arrays
                           time:   [3.7561 ms 3.7814 ms 3.8086 ms]
                           change: [+25.394% +26.833% +28.370%] (p = 0.00 < 0.05)
                           Performance has regressed.
   Found 4 outliers among 100 measurements (4.00%)
     4 (4.00%) high mild
   
   concat str nulls 8192 over 100 arrays
                           time:   [2.3144 ms 2.3269 ms 2.3403 ms]
                           change: [+51.533% +52.826% +54.109%] (p = 0.00 < 0.05)
                           Performance has regressed.
   Found 8 outliers among 100 measurements (8.00%)
     6 (6.00%) high mild
     2 (2.00%) high severe
   ```
   
   </details>
   
   When looking at the assembly
   > Used rustc 1.89.0 with the compiler flags `-C opt-level=2 -C target-feature=+avx2 -C codegen-units=1` in [Godbolt](https://godbolt.org/)
   
   you see that for the old code:
   ```rust
   let mut intermediate = Vec::<T::Offset>::with_capacity(offsets.len() - 1);
   for &offset in &offsets[1..] {
       intermediate.push(offset + shift);
   }
   ```
   
   the assembly for the loop is:
   ```asm
   .LBB3_22:
           mov     rbx, qword ptr [r13 + 8*rbp + 8]
           add     rbx, r15
           cmp     rbp, qword ptr [rsp]
           jne     .LBB3_25
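           # capacity check: if len == capacity, fall through and grow the Vec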
           mov     rdi, rsp
           lea     rsi, [rip + .Lanon.da681cffc384a5add117668a344b291b.6]
           call    qword ptr [rip + alloc::raw_vec::RawVec<T,A>::grow_one::ha1b398ade64b0727@GOTPCREL]
           mov     r14, qword ptr [rsp + 8]
           jmp     .LBB3_25
   
   .LBB3_25:
           mov     qword ptr [r14 + 8*rbp], rbx
           inc     rbp
           mov     qword ptr [rsp + 16], rbp
           add     r12, -8
           je      .LBB3_9
   ```
   
   Note the capacity check and the potential call to `RawVec::grow_one` on every iteration: because each `push` may reallocate, the compiler cannot vectorize this loop.
   
   For the new code:
   ```rust
   let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];
   for (index, &offset) in offsets[1..].iter().enumerate() {
       intermediate[index] = offset + shift;
   }
   ```
   
   the assembly for the loop is:
   ```asm
   .LBB2_21:
           vpaddq  ymm1, ymm0, ymmword ptr [r12 + 8*rdx + 8]
           vpaddq  ymm2, ymm0, ymmword ptr [r12 + 8*rdx + 40]
           vpaddq  ymm3, ymm0, ymmword ptr [r12 + 8*rdx + 72]
           vpaddq  ymm4, ymm0, ymmword ptr [r12 + 8*rdx + 104]
           vmovdqu ymmword ptr [rbx + 8*rdx], ymm1
           vmovdqu ymmword ptr [rbx + 8*rdx + 32], ymm2
           vmovdqu ymmword ptr [rbx + 8*rdx + 64], ymm3
           vmovdqu ymmword ptr [rbx + 8*rdx + 96], ymm4
           add     rdx, 16
           cmp     rax, rdx
           jne     .LBB2_21
   ```
   
   which the compiler auto-vectorizes with AVX2: each `vpaddq` adds four 64-bit offsets at once, and the unrolled loop handles 16 offsets per iteration.
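   
   As an aside (not part of this PR), an iterator-based form typically compiles to the same vectorized loop, because `collect` on a slice iterator knows the exact output length up front and skips the per-element capacity checks. A minimal sketch for `usize` offsets:
   
   ```rust
   /// Hypothetical standalone sketch, not the arrow-rs code: collecting from
   /// a slice iterator allocates exactly once (the length is known up front),
   /// so the addition loop contains no capacity branches and typically
   /// auto-vectorizes as well.
   fn shifted_offsets(offsets: &[usize], shift: usize) -> Vec<usize> {
       offsets[1..].iter().map(|&offset| offset + shift).collect()
   }
   ```
   
   This form also avoids the up-front zero-initialization that `vec![T::Offset::zero(); offsets.len() - 1]` pays, though that cost is small compared to the copies themselves.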
   
   <details>
   <summary>The code that I wrote in Godbolt:</summary>
   
   
   For the old code:
   ```rust
   #[inline(always)]
   fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(
       output: &mut Vec<T>,
       offsets: &[T],
       next_offset: T,
   ) {
       assert_ne!(offsets.len(), 0);
       let shift: T = next_offset + offsets[0];
   
       let mut intermediate = Vec::<T>::with_capacity(offsets.len() - 1);
   
       // Make it easier to find the loop in the assembly
       let mut dummy = 0u64;
       unsafe {
           std::arch::asm!(
               "# VECTORIZED_START
                mov {}, 1",
               out(reg) dummy,
               options(nostack)
           );
       }
   
       for &offset in &offsets[1..] {
           intermediate.push(offset + shift)
       }
   
       // Make it easier to find the loop in the assembly
       unsafe {
           std::arch::asm!(
               "# VECTORIZED_END
                mov {}, 2",
               out(reg) dummy,
               options(nostack)
           );
       }
       std::hint::black_box(dummy);
   
       output.extend_from_slice(&intermediate);
   }
   
   #[no_mangle]
   pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
       extend_offsets(output, offsets, next_offset);
   }
   ```
   
   And for the new code:
   ```rust
   #[inline(always)]
   fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(
       output: &mut Vec<T>,
       offsets: &[T],
       next_offset: T,
   ) {
       assert_ne!(offsets.len(), 0);
       let shift: T = next_offset + offsets[0];
   
       let mut intermediate = vec![T::default(); offsets.len() - 1];
   
       // Make it easier to find the loop in the assembly
       let mut dummy = 0u64;
       unsafe {
           std::arch::asm!(
               "# VECTORIZED_START
                mov {}, 1",
               out(reg) dummy,
               options(nostack)
           );
       }
   
       for (index, &offset) in offsets[1..].iter().enumerate() {
           intermediate[index] = offset + shift
       }
   
       // Make it easier to find the loop in the assembly
       unsafe {
           std::arch::asm!(
               "# VECTORIZED_END
                mov {}, 2",
               out(reg) dummy,
               options(nostack)
           );
       }
       std::hint::black_box(dummy);
   
       output.extend_from_slice(&intermediate);
   }
   
   #[no_mangle]
   pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
       extend_offsets(output, offsets, next_offset);
   }
   ```
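   
   A quick sanity check for either version (a hypothetical driver, not part of the Godbolt input):
   
   ```rust
   fn main() {
       let mut output = vec![0usize];
       // shift = next_offset + offsets[0] = 10 + 0, so the remaining
       // offsets [3, 5, 9] are rebased to [13, 15, 19].
       extend_offsets_usize(&mut output, &[0, 3, 5, 9], 10);
       assert_eq!(output, vec![0, 13, 15, 19]);
   }
   ```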
   
   
   </details>

