This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new ba7dada819 fix(parquet): avoid panic on ColumnIndex length mismatch 
(#9833)
ba7dada819 is described below

commit ba7dada81907e675beabacbe23101ff6efc342de
Author: pchintar <[email protected]>
AuthorDate: Tue Apr 28 20:00:09 2026 -0400

    fix(parquet): avoid panic on ColumnIndex length mismatch (#9833)
    
    # Which issue does this PR close?
    
    - Closes #9832 .
    
    # Rationale for this change
    
    In `parquet/src/file/page_index/column_index.rs`, `ColumnIndex` decoding
    assumes that page-aligned arrays (`null_pages`, `min_values`,
    `max_values`, and optional arrays) have matching lengths, but this is
    not validated.
    
    As a result, malformed metadata can trigger an out-of-bounds panic
    during decoding instead of returning a `ParquetError`. Since parquet
    files are external input, this should be handled safely.
    
    # What changes are included in this PR?
    
    * Added validation in:
    
      * `PrimitiveColumnIndex::try_new`
      * `ByteArrayColumnIndex::try_new`
    
    * Ensures:
    
      * `min_values.len() == null_pages.len()`
      * `max_values.len() == null_pages.len()`
    * optional arrays (`null_counts`, histograms) are consistent with page
    count
    
    * Returns `ParquetError` on mismatch instead of panicking
    
    # Are these changes tested?
    
    Yes.
    
    Added a unit test:
    
    * `test_column_index_rejects_mismatched_min_max_lengths`
    
    This constructs a `ColumnIndex` with mismatched lengths and verifies
    that decoding returns an error instead of panicking.
    
    # Are there any user-facing changes?
    
    No.
---
 parquet/src/file/page_index/column_index.rs | 80 +++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/parquet/src/file/page_index/column_index.rs 
b/parquet/src/file/page_index/column_index.rs
index d005044332..9613292825 100644
--- a/parquet/src/file/page_index/column_index.rs
+++ b/parquet/src/file/page_index/column_index.rs
@@ -106,6 +106,36 @@ impl<T: ParquetValueType> PrimitiveColumnIndex<T> {
     ) -> Result<Self> {
         let len = null_pages.len();
 
+        if min_bytes.len() != len || max_bytes.len() != len {
+            return Err(ParquetError::General(format!(
+                "ColumnIndex min/max length mismatch: expected {len}, got 
min={} max={}",
+                min_bytes.len(),
+                max_bytes.len()
+            )));
+        }
+        if let Some(ref nc) = null_counts {
+            if nc.len() != len {
+                return Err(ParquetError::General(format!(
+                    "ColumnIndex null_counts length mismatch: expected {len}, 
got {}",
+                    nc.len()
+                )));
+            }
+        }
+        if let Some(ref rep) = repetition_level_histograms {
+            if len != 0 && rep.len() % len != 0 {
+                return Err(ParquetError::General(
+                    "Invalid repetition_level_histograms length".to_string(),
+                ));
+            }
+        }
+        if let Some(ref def) = definition_level_histograms {
+            if len != 0 && def.len() % len != 0 {
+                return Err(ParquetError::General(
+                    "Invalid definition_level_histograms length".to_string(),
+                ));
+            }
+        }
+
         let mut min_values = Vec::with_capacity(len);
         let mut max_values = Vec::with_capacity(len);
 
@@ -295,6 +325,36 @@ impl ByteArrayColumnIndex {
     ) -> Result<Self> {
         let len = null_pages.len();
 
+        if min_values.len() != len || max_values.len() != len {
+            return Err(ParquetError::General(format!(
+                "ColumnIndex min/max length mismatch: expected {len}, got 
min={} max={}",
+                min_values.len(),
+                max_values.len()
+            )));
+        }
+        if let Some(ref nc) = null_counts {
+            if nc.len() != len {
+                return Err(ParquetError::General(format!(
+                    "ColumnIndex null_counts length mismatch: expected {len}, 
got {}",
+                    nc.len()
+                )));
+            }
+        }
+        if let Some(ref rep) = repetition_level_histograms {
+            if len != 0 && rep.len() % len != 0 {
+                return Err(ParquetError::General(
+                    "Invalid repetition_level_histograms length".to_string(),
+                ));
+            }
+        }
+        if let Some(ref def) = definition_level_histograms {
+            if len != 0 && def.len() % len != 0 {
+                return Err(ParquetError::General(
+                    "Invalid definition_level_histograms length".to_string(),
+                ));
+            }
+        }
+
         let min_len = min_values.iter().map(|&v| v.len()).sum();
         let max_len = max_values.iter().map(|&v| v.len()).sum();
         let mut min_bytes = vec![0u8; min_len];
@@ -738,4 +798,24 @@ mod tests {
             "Parquet error: error converting value, expected 4 bytes got 0"
         );
     }
+
+    #[test]
+    fn test_column_index_rejects_mismatched_min_max_lengths() {
+        // Two pages, but only one min/max entry. The entry itself is valid 
i32 bytes,
+        // so this specifically checks that lengths must match the number of 
pages.
+        let column_index = ThriftColumnIndex {
+            null_pages: vec![false, false],
+            min_values: vec![&[1u8, 0, 0, 0]],
+            max_values: vec![&[10u8, 0, 0, 0]],
+            null_counts: None,
+            repetition_level_histograms: None,
+            definition_level_histograms: None,
+            boundary_order: BoundaryOrder::UNORDERED,
+        };
+
+        // ColumnIndex arrays must align with the number of pages 
(null_pages.len()).
+        let err = 
PrimitiveColumnIndex::<i32>::try_from_thrift(column_index).unwrap_err();
+        // Should fail because min/max lengths don’t match null_pages
+        assert!(err.to_string().contains("length mismatch"));
+    }
 }

Reply via email to