This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c50ea6eaaf Optimize delta binary decoder in the case where bitwidth=0
(#9477)
c50ea6eaaf is described below
commit c50ea6eaaf484620d4895896400ab0e2ced731ce
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Mar 18 11:47:48 2026 -0700
Optimize delta binary decoder in the case where bitwidth=0 (#9477)
# Which issue does this PR close?
- Closes #9476.
# Rationale for this change
Explore whether we can achieve the speedups seen in arrow-cpp
(https://github.com/apache/arrow/pull/49296).
# What changes are included in this PR?
Adds special cases to the delta binary packed decoder when bitwidth for
a miniblock is 0. The optimization avoids relying on previous values to
decode current ones.
# Are these changes tested?
Yes, tests have been added, as well as new benchmarks.
# Are there any user-facing changes?
No
---
parquet/src/encodings/decoding.rs | 146 +++++++++++++++++++++++++++++++++++---
1 file changed, 135 insertions(+), 11 deletions(-)
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index 58430820a9..7da21e6dd0 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -770,15 +770,48 @@ where
// At this point we have read the deltas to `buffer` we now need
to offset
// these to get back to the original values that were encoded
- for v in &mut buffer[read..read + batch_read] {
+ //
+ // Optimization: if the bit_width for the miniblock is 0, then we
can employ
+ // a faster decoding method than setting `value[i] = value[i-1] +
value[i] + min_delta`.
+ // Where min_delta is 0 (all values in the miniblock are the
same), we can simply
+ // set all values to `self.last_value`. In the case of non-zero
min_delta (values
+ // in the mini-block form an arithmetic progression) each value
can be computed via
+ // `value[i] = (i + 1) * min_delta + last_value`. In both cases we
remove the
+ // dependence on the preceding value.
+ // Kudos to @pitrou for the idea
https://github.com/apache/arrow/pull/49296
+ let min_delta = self.min_delta.as_i64()?;
+ if bit_width == 0 {
+ if min_delta == 0 {
+ buffer[read..read + batch_read].fill(self.last_value);
+ } else {
+ // the c++ version multiplies min_delta by the iter index,
but doing
+ // wrapping_mul through T::T was a bit slower. this is
still
+ // faster than before.
+ let mut delta = self.min_delta;
+ for v in &mut buffer[read..read + batch_read] {
+ *v = self.last_value.wrapping_add(&delta);
+ delta = delta.wrapping_add(&self.min_delta);
+ }
+
+ self.last_value = buffer[read + batch_read - 1];
+ }
+ } else {
// It is OK for deltas to contain "overflowed" values after
encoding,
// e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to
"overflow" again and
// restore original value.
- *v = v
- .wrapping_add(&self.min_delta)
- .wrapping_add(&self.last_value);
-
- self.last_value = *v;
+ if min_delta == 0 {
+ for v in &mut buffer[read..read + batch_read] {
+ *v = v.wrapping_add(&self.last_value);
+ self.last_value = *v;
+ }
+ } else {
+ for v in &mut buffer[read..read + batch_read] {
+ *v = v
+ .wrapping_add(&self.min_delta)
+ .wrapping_add(&self.last_value);
+ self.last_value = *v;
+ }
+ }
}
read += batch_read;
@@ -840,12 +873,33 @@ where
));
}
- for v in &mut skip_buffer[0..skip_count] {
- *v = v
- .wrapping_add(&self.min_delta)
- .wrapping_add(&self.last_value);
+ // see commentary in self.get() above regarding optimizations
+ let min_delta = self.min_delta.as_i64()?;
+ if bit_width == 0 {
+ // if min_delta == 0, there's nothing to do. self.last_value
is unchanged
+ if min_delta != 0 {
+ let mut delta = self.min_delta;
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = self.last_value.wrapping_add(&delta);
+ delta = delta.wrapping_add(&self.min_delta);
+ }
+
+ self.last_value = skip_buffer[skip_count - 1];
+ }
+ } else if min_delta == 0 {
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = v.wrapping_add(&self.last_value);
+
+ self.last_value = *v;
+ }
+ } else {
+ for v in &mut skip_buffer[0..skip_count] {
+ *v = v
+ .wrapping_add(&self.min_delta)
+ .wrapping_add(&self.last_value);
- self.last_value = *v;
+ self.last_value = *v;
+ }
}
skip += mini_block_should_skip;
@@ -1802,6 +1856,76 @@ mod tests {
);
}
+ #[test]
+ fn test_delta_bit_packed_int32_single_value_large() {
+ let block_data = vec![3; 10240];
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_single_value_skip_large() {
+ let block_data = vec![3; 10240];
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_increasing_value_large() {
+ let block_data = (0i32..10240).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_increasing_value_skip_large() {
+ let block_data = (0i32..10240).collect::<Vec<i32>>();
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_stepped_value_large() {
+ let block_data = (0i32..10240).map(|i| i / 2).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_stepped_value_skip_large() {
+ let block_data = (0i32..10240).map(|i| i / 2).collect::<Vec<i32>>();
+ test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 50);
+ test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED,
5000);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int32_mixed_large() {
+ // should be enough for 4 mini-blocks plus a little so we get some
+ // mixed mini-blocks
+ const BLOCK_SIZE: i32 = 133;
+ let block1_data = (0..BLOCK_SIZE).map(|i| (i * 7) % 11).collect();
+ let block2_data = vec![3; BLOCK_SIZE as usize];
+ let block3_data = (0..BLOCK_SIZE).map(|i| (i * 5) % 13).collect();
+ let block4_data = (0..BLOCK_SIZE).collect();
+ let block5_data = (0..BLOCK_SIZE).map(|i| (i * 3) % 17).collect();
+ test_delta_bit_packed_decode::<Int32Type>(vec![
+ block1_data,
+ block2_data,
+ block3_data,
+ block4_data,
+ block5_data,
+ ]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int64_single_value_large() {
+ let block_data = vec![5; 10240];
+ test_delta_bit_packed_decode::<Int64Type>(vec![block_data]);
+ }
+
+ #[test]
+ fn test_delta_bit_packed_int64_increasing_value_large() {
+ let block_data = (0i64..10240).collect();
+ test_delta_bit_packed_decode::<Int64Type>(vec![block_data]);
+ }
+
#[test]
fn test_delta_byte_array_same_arrays() {
let data = vec![