This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d3cad6e0a4 [Parquet] Do not panic when trying to skip records in delta 
encoded files using non-standard block sizes (#9794)
d3cad6e0a4 is described below

commit d3cad6e0a402aeddbdeb67e9f276d9d697b69860
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 1 15:03:20 2026 -0700

    [Parquet] Do not panic when trying to skip records in delta encoded files 
using non-standard block sizes (#9794)
    
    # Which issue does this PR close?
    
    - Closes #9793.
    
    # Rationale for this change
    `DeltaBitPackDecoder::skip` uses some magic numbers when sizing the
    buffer used for skipping. Files that use non-standard miniblock sizes
    will cause `skip` to panic.
    
    # What changes are included in this PR?
    
    Check for non-standard miniblock sizes and return an error rather than a
    panic.
    
    Note: allocating a vec sized with `values_per_mini_block` resulted in a
    significant performance regression.
    
    # Are these changes tested?
    
    Yes, test with file with non-standard sizes is added to the arrow_reader
    tests.
    
    # Are there any user-facing changes?
    
    No.
---
 parquet/src/encodings/decoding.rs           |  22 +++++++++++++++++----
 parquet/tests/arrow_reader/bad_data.rs      |  29 +++++++++++++++++++++++++++-
 parquet/tests/arrow_reader/bigdelta.parquet | Bin 0 -> 3660 bytes
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/parquet/src/encodings/decoding.rs 
b/parquet/src/encodings/decoding.rs
index da07d23eb6..0fb9604122 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -847,10 +847,21 @@ where
             self.values_left -= 1;
         }
 
-        let mini_block_batch_size = match T::T::PHYSICAL_TYPE {
-            Type::INT32 => 32,
-            Type::INT64 => 64,
-            _ => unreachable!(),
+        // See https://github.com/apache/arrow-rs/pull/9794.
+        // The parquet spec actually allows for miniblock sizes other than 32 
or 64, but
+        // no current writers use anything else. Using values_per_mini_block 
directly
+        // for the skip_buffer doesn't allow stack allocation and leads to a 
significant
+        // drop in performance. We'll settle for erroring out here and come up 
with a
+        // better fix if writers ever start getting creative with block sizes.
+        let mini_block_batch_size = match self.values_per_mini_block {
+            32 => 32,
+            64 => 64,
+            _ => {
+                return Err(general_err!(
+                    "cannot skip miniblock of size {}",
+                    self.values_per_mini_block
+                ));
+            }
         };
 
         let mut skip_buffer = vec![T::T::default(); mini_block_batch_size];
@@ -863,10 +874,13 @@ where
             self.check_bit_width(bit_width)?;
             let mini_block_to_skip = self.mini_block_remaining.min(to_skip - 
skip);
 
+            // see commentary in self.get() above regarding optimizations
             let min_delta = self.min_delta.as_i64()?;
             if bit_width == 0 {
                 // All remainders are zero: every delta equals min_delta 
exactly.
                 // Advance last_value by n * min_delta with no bit reads.
+                // When min_delta == 0 there is nothing to do: last_value is
+                // unchanged and no bytes are consumed from the bit reader.
                 if min_delta != 0 {
                     let total = min_delta.wrapping_mul(mini_block_to_skip as 
i64);
                     let step = T::T::from_i64(total)
diff --git a/parquet/tests/arrow_reader/bad_data.rs 
b/parquet/tests/arrow_reader/bad_data.rs
index 54c92976e4..d9b0d89e2c 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -18,6 +18,7 @@
 //! Tests that reading invalid parquet files returns an error
 
 use arrow::util::test_util::parquet_test_data;
+use bytes::Bytes;
 use parquet::arrow::arrow_reader::ArrowReaderBuilder;
 use parquet::errors::ParquetError;
 use std::collections::HashSet;
@@ -147,11 +148,37 @@ fn read_file(name: &str) -> Result<usize, ParquetError> {
     Ok(num_rows)
 }
 
+#[test]
+fn non_standard_delta_blocks() {
+    let file = Bytes::from_static(include_bytes!("bigdelta.parquet"));
+    use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+
+    let selectors = vec![RowSelector::skip(1000), RowSelector::select(5)];
+
+    let selection: RowSelection = selectors.into();
+    let reader = ArrowReaderBuilder::try_new(file)
+        .unwrap()
+        .with_row_selection(selection)
+        .build()
+        .unwrap();
+
+    if let Some(maybe_batch) = reader.into_iter().next() {
+        // TODO: uncomment if we ever allow skipping miniblocks > 64 elements
+        //let batch = maybe_batch.expect("skip should succeed");
+        //assert_eq!(batch.num_rows(), 5);
+        assert!(
+            maybe_batch
+                .unwrap_err()
+                .to_string()
+                .contains("cannot skip miniblock of size 128")
+        );
+    }
+}
+
 #[cfg(feature = "async")]
 #[tokio::test]
 #[allow(deprecated)]
 async fn bad_metadata_err() {
-    use bytes::Bytes;
     use parquet::file::metadata::ParquetMetaDataReader;
 
     let metadata_buffer = 
Bytes::from_static(include_bytes!("bad_raw_metadata.bin"));
diff --git a/parquet/tests/arrow_reader/bigdelta.parquet 
b/parquet/tests/arrow_reader/bigdelta.parquet
new file mode 100644
index 0000000000..0fb270af8f
Binary files /dev/null and b/parquet/tests/arrow_reader/bigdelta.parquet differ

Reply via email to