This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0dda129be7 Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts (#5362)
0dda129be7 is described below

commit 0dda129be715a95981f0151c8378f0b0b37a4ae9
Author: Matthieu Maitre <[email protected]>
AuthorDate: Sun Feb 4 14:43:43 2024 -0800

    Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts (#5362)
    
    * Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts
    
    * Update parquet/src/file/metadata.rs
    
    Co-authored-by: Jeffrey Vo <[email protected]>
    
    * Fix test
    
    ---------
    
    Co-authored-by: Matthieu Maitre <[email protected]>
    Co-authored-by: Jeffrey Vo <[email protected]>
---
 parquet/src/file/metadata.rs | 77 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index a1f3c87d0a..acd3a9f938 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -349,7 +349,13 @@ impl RowGroupMetaData {
 
     /// Method to convert from Thrift.
     pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
-        assert_eq!(schema_descr.num_columns(), rg.columns.len());
+        if schema_descr.num_columns() != rg.columns.len() {
+            return Err(general_err!(
+                "Column count mismatch. Schema has {} columns while Row Group has {}",
+                schema_descr.num_columns(),
+                rg.columns.len()
+            ));
+        }
         let total_byte_size = rg.total_byte_size;
         let num_rows = rg.num_rows;
         let mut columns = vec![];
@@ -1039,6 +1045,75 @@ mod tests {
         }
     }
 
+    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
+    #[test]
+    fn test_row_group_metadata_thrift_corrupted() {
+        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
+            SchemaType::group_type_builder("schema")
+                .with_fields(vec![
+                    Arc::new(
+                        SchemaType::primitive_type_builder("a", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                    Arc::new(
+                        SchemaType::primitive_type_builder("b", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                ])
+                .build()
+                .unwrap(),
+        )));
+
+        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
+            SchemaType::group_type_builder("schema")
+                .with_fields(vec![
+                    Arc::new(
+                        SchemaType::primitive_type_builder("a", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                    Arc::new(
+                        SchemaType::primitive_type_builder("b", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                    Arc::new(
+                        SchemaType::primitive_type_builder("c", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                ])
+                .build()
+                .unwrap(),
+        )));
+
+        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
+            .set_num_rows(1000)
+            .set_total_byte_size(2000)
+            .set_column_metadata(vec![
+                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
+                    .build()
+                    .unwrap(),
+                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
+                    .build()
+                    .unwrap(),
+            ])
+            .set_ordinal(1)
+            .build()
+            .unwrap();
+
+        let err =
+            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
+                .unwrap_err()
+                .to_string();
+        assert_eq!(
+            err,
+            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
+        );
+    }
+
     #[test]
     fn test_column_chunk_metadata_thrift_conversion() {
         let column_descr = get_test_schema_descr().column(0);

Reply via email to