This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new ba7dada819 fix(parquet): avoid panic on ColumnIndex length mismatch
(#9833)
ba7dada819 is described below
commit ba7dada81907e675beabacbe23101ff6efc342de
Author: pchintar <[email protected]>
AuthorDate: Tue Apr 28 20:00:09 2026 -0400
fix(parquet): avoid panic on ColumnIndex length mismatch (#9833)
# Which issue does this PR close?
- Closes #9832 .
# Rationale for this change
In `parquet/src/file/page_index/column_index.rs`, `ColumnIndex` decoding
assumes that page-aligned arrays (`null_pages`, `min_values`,
`max_values`, and optional arrays) have matching lengths, but this is
not validated.
As a result, malformed metadata can trigger an out-of-bounds panic
during decoding instead of returning a `ParquetError`. Since parquet
files are external input, this should be handled safely.
# What changes are included in this PR?
* Added validation in:
* `PrimitiveColumnIndex::try_new`
* `ByteArrayColumnIndex::try_new`
* Ensures:
* `min_values.len() == null_pages.len()`
* `max_values.len() == null_pages.len()`
* optional arrays (`null_counts`, histograms) are consistent with page
count
* Returns `ParquetError` on mismatch instead of panicking
# Are these changes tested?
Yes.
Added a unit test:
* `test_column_index_rejects_mismatched_min_max_lengths`
This constructs a `ColumnIndex` with mismatched lengths and verifies
that decoding returns an error instead of panicking.
# Are there any user-facing changes?
No.
---
parquet/src/file/page_index/column_index.rs | 80 +++++++++++++++++++++++++++++
1 file changed, 80 insertions(+)
diff --git a/parquet/src/file/page_index/column_index.rs
b/parquet/src/file/page_index/column_index.rs
index d005044332..9613292825 100644
--- a/parquet/src/file/page_index/column_index.rs
+++ b/parquet/src/file/page_index/column_index.rs
@@ -106,6 +106,36 @@ impl<T: ParquetValueType> PrimitiveColumnIndex<T> {
) -> Result<Self> {
let len = null_pages.len();
+ if min_bytes.len() != len || max_bytes.len() != len {
+ return Err(ParquetError::General(format!(
+ "ColumnIndex min/max length mismatch: expected {len}, got
min={} max={}",
+ min_bytes.len(),
+ max_bytes.len()
+ )));
+ }
+ if let Some(ref nc) = null_counts {
+ if nc.len() != len {
+ return Err(ParquetError::General(format!(
+ "ColumnIndex null_counts length mismatch: expected {len},
got {}",
+ nc.len()
+ )));
+ }
+ }
+ if let Some(ref rep) = repetition_level_histograms {
+ if len != 0 && rep.len() % len != 0 {
+ return Err(ParquetError::General(
+ "Invalid repetition_level_histograms length".to_string(),
+ ));
+ }
+ }
+ if let Some(ref def) = definition_level_histograms {
+ if len != 0 && def.len() % len != 0 {
+ return Err(ParquetError::General(
+ "Invalid definition_level_histograms length".to_string(),
+ ));
+ }
+ }
+
let mut min_values = Vec::with_capacity(len);
let mut max_values = Vec::with_capacity(len);
@@ -295,6 +325,36 @@ impl ByteArrayColumnIndex {
) -> Result<Self> {
let len = null_pages.len();
+ if min_values.len() != len || max_values.len() != len {
+ return Err(ParquetError::General(format!(
+ "ColumnIndex min/max length mismatch: expected {len}, got
min={} max={}",
+ min_values.len(),
+ max_values.len()
+ )));
+ }
+ if let Some(ref nc) = null_counts {
+ if nc.len() != len {
+ return Err(ParquetError::General(format!(
+ "ColumnIndex null_counts length mismatch: expected {len},
got {}",
+ nc.len()
+ )));
+ }
+ }
+ if let Some(ref rep) = repetition_level_histograms {
+ if len != 0 && rep.len() % len != 0 {
+ return Err(ParquetError::General(
+ "Invalid repetition_level_histograms length".to_string(),
+ ));
+ }
+ }
+ if let Some(ref def) = definition_level_histograms {
+ if len != 0 && def.len() % len != 0 {
+ return Err(ParquetError::General(
+ "Invalid definition_level_histograms length".to_string(),
+ ));
+ }
+ }
+
let min_len = min_values.iter().map(|&v| v.len()).sum();
let max_len = max_values.iter().map(|&v| v.len()).sum();
let mut min_bytes = vec![0u8; min_len];
@@ -738,4 +798,24 @@ mod tests {
"Parquet error: error converting value, expected 4 bytes got 0"
);
}
+
+ #[test]
+ fn test_column_index_rejects_mismatched_min_max_lengths() {
+ // Two pages, but only one min/max entry. The entry itself is valid
i32 bytes,
+ // so this specifically checks that lengths must match the number of
pages.
+ let column_index = ThriftColumnIndex {
+ null_pages: vec![false, false],
+ min_values: vec![&[1u8, 0, 0, 0]],
+ max_values: vec![&[10u8, 0, 0, 0]],
+ null_counts: None,
+ repetition_level_histograms: None,
+ definition_level_histograms: None,
+ boundary_order: BoundaryOrder::UNORDERED,
+ };
+
+ // ColumnIndex arrays must align with the number of pages
(null_pages.len()).
+ let err =
PrimitiveColumnIndex::<i32>::try_from_thrift(column_index).unwrap_err();
+ // Should fail because min/max lengths don’t match null_pages
+ assert!(err.to_string().contains("length mismatch"));
+ }
}