(arrow-rs) branch main updated: perf(parquet): Vectorize dict-index bounds check in RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)

dheres Sun, 19 Apr 2026 15:09:53 -0700

This is an automated email from the ASF dual-hosted git repository.

Dandandan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new de11d9cbb5 perf(parquet): Vectorize dict-index bounds check in 
RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)
de11d9cbb5 is described below

commit de11d9cbb5bd65fe420800dc32d60891bbaac980
Author: Daniël Heres <[email protected]>
AuthorDate: Mon Apr 20 00:09:40 2026 +0200

    perf(parquet): Vectorize dict-index bounds check in 
RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)
    
    # Which issue does this PR close?
    
    - Close: https://github.com/apache/arrow-rs/issues/9747
    
    # Rationale for this change
    
    Rewrite the code to generate more SIMD instructions / amortize
    loop/branching overhead.
    
    # What changes are included in this PR?
    
    One-file change in `parquet/src/encodings/rle.rs`:
    
    - u32 max-reduction bounds check over `CHUNK = 16` indices
    - `#[cold] #[inline(never)] fn oob` for the panic path
    - `get_unchecked` gather after the vectorised check
    
    | type             | main             | this-pr          |     Δ |
    | ---------------- | ---------------- | ---------------- | -----:|
    | UInt64Array      |  60.6 ± 7.75 µs  |  55.8 ± 0.58 µs  | −7.9% |
    | Int64Array       |  58.2 ± 0.77 µs  |  55.3 ± 0.34 µs  | −5.0% |
    | INT32 Decimal128 |  88.5 ± 2.07 µs  |  84.3 ± 0.35 µs  | −4.7% |
    | Int32Array       |  49.6 ± 0.47 µs  |  47.4 ± 0.50 µs  | −4.4% |
    | UInt8Array       |  53.0 ± 0.42 µs  |  50.9 ± 0.60 µs  | −4.0% |
    | Int16Array       |  53.6 ± 0.49 µs  |  51.7 ± 1.38 µs  | −3.6% |
    | UInt16Array      |  53.2 ± 0.34 µs  |  51.7 ± 0.90 µs  | −2.8% |
    | UInt32Array      |  49.7 ± 4.23 µs  |  48.4 ± 1.23 µs  | −2.6% |
    | INT64 Decimal128 |  96.4 ± 2.10 µs  |  94.9 ± 1.47 µs  | −1.5% |
    | Int8Array        |  53.2 ± 0.61 µs  |  52.8 ± 5.21 µs  | −0.7% |
    
    
    # Are these changes tested?
    
    Existing tests
    
    # Are there any user-facing changes?
    
    No — no API change; decoded output is identical, panic behaviour on
    out-of-bounds indices is preserved.
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    ---------
    
    Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
 parquet/src/encodings/rle.rs | 48 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index ea236f652a..937be1dd2c 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -484,7 +484,16 @@ impl RleDecoder {
             if self.rle_left > 0 {
                 let num_values = cmp::min(max_values - values_read, 
self.rle_left as usize);
                 let dict_idx = self.current_value.unwrap() as usize;
-                let dict_value = dict[dict_idx].clone();
+                let dict_value = dict
+                    .get(dict_idx)
+                    .ok_or_else(|| {
+                        general_err!(
+                            "dictionary index out of bounds: the len is {} but 
the index is {}",
+                            dict.len(),
+                            dict_idx
+                        )
+                    })?
+                    .clone();
 
                 buffer[values_read..values_read + num_values].fill(dict_value);
 
@@ -514,16 +523,30 @@ impl RleDecoder {
                         break;
                     }
                     {
+                        #[cold]
+                        #[inline(never)]
+                        fn oob(max_idx: u32, dict_len: usize) -> ParquetError {
+                            general_err!(
+                                "dictionary index out of bounds: the len is {} 
but the index is {}",
+                                dict_len,
+                                max_idx
+                            )
+                        }
+                        const CHUNK: usize = 16;
                         let out = &mut buffer[values_read..values_read + 
num_values];
                         let idx = &index_buf[..num_values];
-                        let mut out_chunks = out.chunks_exact_mut(8);
-                        let idx_chunks = idx.chunks_exact(8);
+                        let dict_len = dict.len();
+                        let mut out_chunks = out.chunks_exact_mut(CHUNK);
+                        let idx_chunks = idx.chunks_exact(CHUNK);
                         for (out_chunk, idx_chunk) in 
out_chunks.by_ref().zip(idx_chunks) {
-                            let dict_len = dict.len();
-                            assert!(
-                                idx_chunk.iter().all(|&i| (i as usize) < 
dict_len),
-                                "dictionary index out of bounds"
-                            );
+                            // u32 max-reduction instead of `.all(|&i| ..)`: 
`.all`
+                            // short-circuits and blocks autovectorisation. 
Negative
+                            // i32 cast to u32 becomes a large value so the 
bounds
+                            // check still rejects it.
+                            let max_idx = idx_chunk.iter().fold(0u32, |acc, 
&i| acc.max(i as u32));
+                            if (max_idx as usize) >= dict_len {
+                                return Err(oob(max_idx, dict_len));
+                            }
                             for (b, i) in 
out_chunk.iter_mut().zip(idx_chunk.iter()) {
                                 // SAFETY: all indices checked above to be in 
bounds
                                 b.clone_from(unsafe { dict.get_unchecked(*i as 
usize) });
@@ -532,9 +555,14 @@ impl RleDecoder {
                         for (b, i) in out_chunks
                             .into_remainder()
                             .iter_mut()
-                            .zip(idx.chunks_exact(8).remainder().iter())
+                            .zip(idx.chunks_exact(CHUNK).remainder().iter())
                         {
-                            b.clone_from(&dict[*i as usize]);
+                            let dict_idx = *i as usize;
+                            if dict_idx >= dict_len {
+                                return Err(oob(*i as u32, dict_len));
+                            }
+                            // SAFETY: bounds checked above
+                            b.clone_from(unsafe { dict.get_unchecked(dict_idx) 
});
                         }
                     }
                     self.bit_packed_left -= num_values as u32;

(arrow-rs) branch main updated: perf(parquet): Vectorize dict-index bounds check in RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)

Reply via email to