This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c50ea6eaaf Optimize delta binary decoder in the case where bitwidth=0
(#9477)
c50ea6eaaf is described below
commit c50ea6eaaf484620d4895896400ab0e2ced731ce
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Mar 18 11:47:48 2026 -0700
Optimize delta binary decoder in the case where bitwidth=0 (#9477)
# Which issue does this PR close?
- Closes #9476.
# Rationale for this change
Explore whether we can achieve the speedups seen in arrow-cpp
(https://github.com/apache/arrow/pull/49296).
# What changes are included in this PR?
Adds special cases to the delta binary packed decoder when bitwidth for
a miniblock is 0. The optimization avoids relying on previous values to
decode current ones.
# Are these changes tested?
Yes, tests have been added, as well as new benchmarks.
# Are there any user-facing changes?
No
---
parquet/src/encodings/decoding.rs | 146 +++++++++++++++++++++++++++++++++++---
1 file changed, 135 insertions(+), 11 deletions(-)
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index 58430820a9..7da21e6dd0 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -770,15 +770,48 @@ where
// At this point we have read the deltas to `buffer` we now need
to offset
// these to get back to the original values that were encoded
- for v in &mut buffer[read..read + batch_read] {
+ //
+ // Optimization: if the bit_width for the miniblock is 0, then we
can employ
+ // a faster decoding method than setting `value[i] = value[i-1] +
value[i] + min_delta`.
+ // Where min_delta is 0 (all values in the miniblock are the
same), we can simply
+ // set all values to `self.last_value`. In the case of non-zero
min_delta (values
+ // in the mini-block form an arithmetic progression) each value
can be computed via
+ // `value[i] = (i + 1) * min_delta + last_value`. In both cases we
remove the
+ // dependence on the preceding value.
+ // Kudos to @pitrou for the idea
https://github.com/apache/arrow/pull/49296
+ let min_delta = self.min_delta.as_i64()?;
+ if bit_width == 0 {
+ if min_delta == 0 {
+ buffer[read..read + batch_read].fill(self.last_value);
+ } else {
+ // the c++ version multiplies min_delta by the iter index,
but doing
+ // wrapping_mul through T::T was a bit slower. this is
still
+ // faster than before.
+ let mut delta = self.min_delta;
+ for v in &mut buffer[read..read + batch_read] {
+ *v = self.last_value.wrapping_add(&delta);
+ delta = delta.wrapping_add(&self.min_delta);
+ }
+
+ self.last_value = buffer[read + batch_read - 1];
+ }
+ } else {
// It is OK for deltas to contain "overflowed" values after
encoding,
// e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to
"overflow" again and
// restore original value.
- *v = v
- .wrapping_add(&self.min_delta)
- .wrapping_add(&self.last_value);
-
- self.last_value = *v;
+ if min_delta == 0 {
+ for v in &mut buffer[read..read + batch_read] {
+ *v = v.wrapping_add(&self.last_value);
+ self.last_value = *v;
+ }
+ } else {
+ for v in &mut buffer[read..read + batch_read] {
+ *v = v
+ .wrapping_add(&self.min_delta)
+ .wrapping_add(&self.last_value);
+ self.last_value = *v;
+ }
+ }
}
read += batch_read;
@@ -840,12 +873,33 @@ where
));
}
- for v in &mut skip_buffer[0..skip_count] {
- *v = v
- .wrapping_add(&self.min_delta)
- .wrapping_add(&self.last_value);
+ // see commentary in self.get() above regarding optimizations
+ let min_delta = self.min_delta.as_i64()?;
+ if bit_width == 0 {
+ // if min_delta == 0, there's nothing to do. self.last_value
is unchanged
+ if min_delta != 0 {
+ let mut delta = self.min_delta;
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = self.last_value.wrapping_add(&delta);
+ delta = delta.wrapping_add(&self.min_delta);
+ }
+
+ self.last_value = skip_buffer[skip_count - 1];
+ }
+ } else if min_delta == 0 {
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = v.wrapping_add(&self.last_value);
+
+ self.last_value = *v;
+ }
+ } else {
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = v
+ .wrapping_add(&self.min_delta)
+ .wrapping_add(&self.last_value);
- self.last_value = *v;
+ self.last_value = *v;
+ }
}
skip += mini_block_should_skip;
@@ -1802,6 +1856,76 @@ mod tests {
);
}
+ #[test]
+ fn test_delta_bit_packed_int32_single_value_large() {
+ let block_data = vec![3; 10240];
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_single_value_skip_large() {
+ let block_data = vec![3; 10240];
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_increasing_value_large() {
+ let block_data = (0i32..10240).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_increasing_value_skip_large() {
+ let block_data = (0i32..10240).collect::<Vec<i32>>();
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_stepped_value_large() {
+ let block_data = (0i32..10240).map(|i| i / 2).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_stepped_value_skip_large() {
+ let block_data = (0i32..10240).map(|i| i / 2).collect::<Vec<i32>>();
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_mixed_large() {
+ // should be enough for 4 mini-blocks plus a little so we get some
+ // mixed mini-blocks
+ const BLOCK_SIZE: i32 = 133;
+ let block1_data = (0..BLOCK_SIZE).map(|i| (i * 7) % 11).collect();
+ let block2_data = vec![3; BLOCK_SIZE as usize];
+ let block3_data = (0..BLOCK_SIZE).map(|i| (i * 5) % 13).collect();
+ let block4_data = (0..BLOCK_SIZE).collect();
+ let block5_data = (0..BLOCK_SIZE).map(|i| (i * 3) % 17).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![
+ block1_data,
+ block2_data,
+ block3_data,
+ block4_data,
+ block5_data,
+ ]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int64_single_value_large() {
+ let block_data = vec![5; 10240];
+ test_delta_bit_packed_decode::<Int64Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int64_increasing_value_large() {
+ let block_data = (0i64..10240).collect();
+ test_delta_bit_packed_decode::<Int64Type>(vec![block_data]);
+ }
+
#[test]
fn test_delta_byte_array_same_arrays() {
let data = vec![