This is an automated email from the ASF dual-hosted git repository.
Dandandan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new de11d9cbb5 perf(parquet): Vectorize dict-index bounds check in
RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)
de11d9cbb5 is described below
commit de11d9cbb5bd65fe420800dc32d60891bbaac980
Author: Daniël Heres <[email protected]>
AuthorDate: Mon Apr 20 00:09:40 2026 +0200
perf(parquet): Vectorize dict-index bounds check in
RleDecoder::get_batch_with_dict (up to -7.9%) (#9746)
# Which issue does this PR close?
- Close: https://github.com/apache/arrow-rs/issues/9747
# Rationale for this change
Rewrite the code to generate more SIMD instructions / amortize
loop/branching overhead.
# What changes are included in this PR?
One-file change in `parquet/src/encodings/rle.rs`:
- u32 max-reduction bounds check over `CHUNK = 16` indices
- `#[cold] #[inline(never)] fn oob` for the panic path
- `get_unchecked` gather after the vectorised check
| type | main | this-pr | Δ |
| ---------------- | ---------------- | ---------------- | -----:|
| UInt64Array | 60.6 ± 7.75 µs | 55.8 ± 0.58 µs | −7.9% |
| Int64Array | 58.2 ± 0.77 µs | 55.3 ± 0.34 µs | −5.0% |
| INT32 Decimal128 | 88.5 ± 2.07 µs | 84.3 ± 0.35 µs | −4.7% |
| Int32Array | 49.6 ± 0.47 µs | 47.4 ± 0.50 µs | −4.4% |
| UInt8Array | 53.0 ± 0.42 µs | 50.9 ± 0.60 µs | −4.0% |
| Int16Array | 53.6 ± 0.49 µs | 51.7 ± 1.38 µs | −3.6% |
| UInt16Array | 53.2 ± 0.34 µs | 51.7 ± 0.90 µs | −2.8% |
| UInt32Array | 49.7 ± 4.23 µs | 48.4 ± 1.23 µs | −2.6% |
| INT64 Decimal128 | 96.4 ± 2.10 µs | 94.9 ± 1.47 µs | −1.5% |
| Int8Array | 53.2 ± 0.61 µs | 52.8 ± 5.21 µs | −0.7% |
# Are these changes tested?
Existing tests
# Are there any user-facing changes?
No — no API change; decoded output is identical, panic behaviour on
out-of-bounds indices is preserved.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
parquet/src/encodings/rle.rs | 48 +++++++++++++++++++++++++++++++++++---------
1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index ea236f652a..937be1dd2c 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -484,7 +484,16 @@ impl RleDecoder {
if self.rle_left > 0 {
let num_values = cmp::min(max_values - values_read,
self.rle_left as usize);
let dict_idx = self.current_value.unwrap() as usize;
- let dict_value = dict[dict_idx].clone();
+ let dict_value = dict
+ .get(dict_idx)
+ .ok_or_else(|| {
+ general_err!(
+ "dictionary index out of bounds: the len is {} but
the index is {}",
+ dict.len(),
+ dict_idx
+ )
+ })?
+ .clone();
buffer[values_read..values_read + num_values].fill(dict_value);
@@ -514,16 +523,30 @@ impl RleDecoder {
break;
}
{
+ #[cold]
+ #[inline(never)]
+ fn oob(max_idx: u32, dict_len: usize) -> ParquetError {
+ general_err!(
+ "dictionary index out of bounds: the len is {}
but the index is {}",
+ dict_len,
+ max_idx
+ )
+ }
+ const CHUNK: usize = 16;
let out = &mut buffer[values_read..values_read +
num_values];
let idx = &index_buf[..num_values];
- let mut out_chunks = out.chunks_exact_mut(8);
- let idx_chunks = idx.chunks_exact(8);
+ let dict_len = dict.len();
+ let mut out_chunks = out.chunks_exact_mut(CHUNK);
+ let idx_chunks = idx.chunks_exact(CHUNK);
for (out_chunk, idx_chunk) in
out_chunks.by_ref().zip(idx_chunks) {
- let dict_len = dict.len();
- assert!(
- idx_chunk.iter().all(|&i| (i as usize) <
dict_len),
- "dictionary index out of bounds"
- );
+ // u32 max-reduction instead of `.all(|&i| ..)`:
`.all`
+ // short-circuits and blocks autovectorisation.
Negative
+ // i32 cast to u32 becomes a large value so the
bounds
+ // check still rejects it.
+ let max_idx = idx_chunk.iter().fold(0u32, |acc,
&i| acc.max(i as u32));
+ if (max_idx as usize) >= dict_len {
+ return Err(oob(max_idx, dict_len));
+ }
for (b, i) in
out_chunk.iter_mut().zip(idx_chunk.iter()) {
// SAFETY: all indices checked above to be in
bounds
b.clone_from(unsafe { dict.get_unchecked(*i as
usize) });
@@ -532,9 +555,14 @@ impl RleDecoder {
for (b, i) in out_chunks
.into_remainder()
.iter_mut()
- .zip(idx.chunks_exact(8).remainder().iter())
+ .zip(idx.chunks_exact(CHUNK).remainder().iter())
{
- b.clone_from(&dict[*i as usize]);
+ let dict_idx = *i as usize;
+ if dict_idx >= dict_len {
+ return Err(oob(*i as u32, dict_len));
+ }
+ // SAFETY: bounds checked above
+ b.clone_from(unsafe { dict.get_unchecked(dict_idx)
});
}
}
self.bit_packed_left -= num_values as u32;