This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0dda129be7 Return an error instead of a panic when reading a corrupted
Parquet file with mismatched column counts (#5362)
0dda129be7 is described below
commit 0dda129be715a95981f0151c8378f0b0b37a4ae9
Author: Matthieu Maitre <[email protected]>
AuthorDate: Sun Feb 4 14:43:43 2024 -0800
Return an error instead of a panic when reading a corrupted Parquet file
with mismatched column counts (#5362)
* Return an error instead of a panic when reading a corrupted Parquet file
with mismatched column counts
* Update parquet/src/file/metadata.rs
Co-authored-by: Jeffrey Vo <[email protected]>
* Fix test
---------
Co-authored-by: Matthieu Maitre <[email protected]>
Co-authored-by: Jeffrey Vo <[email protected]>
---
parquet/src/file/metadata.rs | 77 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 76 insertions(+), 1 deletion(-)
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index a1f3c87d0a..acd3a9f938 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -349,7 +349,13 @@ impl RowGroupMetaData {
/// Method to convert from Thrift.
pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
- assert_eq!(schema_descr.num_columns(), rg.columns.len());
+ if schema_descr.num_columns() != rg.columns.len() {
+ return Err(general_err!(
+ "Column count mismatch. Schema has {} columns while Row Group has {}",
+ schema_descr.num_columns(),
+ rg.columns.len()
+ ));
+ }
let total_byte_size = rg.total_byte_size;
let num_rows = rg.num_rows;
let mut columns = vec![];
@@ -1039,6 +1045,75 @@ mod tests {
}
}
+ /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
+ #[test]
+ fn test_row_group_metadata_thrift_corrupted() {
+ let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
+ SchemaType::group_type_builder("schema")
+ .with_fields(vec![
+ Arc::new(
+ SchemaType::primitive_type_builder("a", Type::INT32)
+ .build()
+ .unwrap(),
+ ),
+ Arc::new(
+ SchemaType::primitive_type_builder("b", Type::INT32)
+ .build()
+ .unwrap(),
+ ),
+ ])
+ .build()
+ .unwrap(),
+ )));
+
+ let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
+ SchemaType::group_type_builder("schema")
+ .with_fields(vec![
+ Arc::new(
+ SchemaType::primitive_type_builder("a", Type::INT32)
+ .build()
+ .unwrap(),
+ ),
+ Arc::new(
+ SchemaType::primitive_type_builder("b", Type::INT32)
+ .build()
+ .unwrap(),
+ ),
+ Arc::new(
+ SchemaType::primitive_type_builder("c", Type::INT32)
+ .build()
+ .unwrap(),
+ ),
+ ])
+ .build()
+ .unwrap(),
+ )));
+
+ let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
+ .set_num_rows(1000)
+ .set_total_byte_size(2000)
+ .set_column_metadata(vec![
+ ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
+ .build()
+ .unwrap(),
+ ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
+ .build()
+ .unwrap(),
+ ])
+ .set_ordinal(1)
+ .build()
+ .unwrap();
+
+ let err =
+ RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
+ .unwrap_err()
+ .to_string();
+ assert_eq!(
+ err,
+ "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
+ );
+ }
+
#[test]
fn test_column_chunk_metadata_thrift_conversion() {
let column_descr = get_test_schema_descr().column(0);