This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new fe3c0c92bc fix(parquet): Avoid panic on malformed thrift bool fields 
in parquet metadata (#9840)
fe3c0c92bc is described below

commit fe3c0c92bc4e3617cd82b74ca01376926ee98880
Author: BoazC-MSFT <[email protected]>
AuthorDate: Wed Apr 29 02:54:18 2026 +0300

    fix(parquet): Avoid panic on malformed thrift bool fields in parquet 
metadata (#9840)
    
    Return a Parquet error when a compact thrift field is expected to be a
    `bool` but the field header contains a non-bool wire type. This replaces
    the remaining `bool_val` unwraps in generated thrift readers and the
    hand-expanded `DataPageHeaderV2` reader.
    
    Add regression tests for malformed `DictionaryPageHeader.is_sorted` and
    `DataPageHeaderV2.is_compressed` headers to ensure corrupt input returns
    an error instead of panicking.
    
    # Which issue does this PR close?
    
    - Closes #9839.
    
    # Rationale for this change
    See #9839
    
    # What changes are included in this PR?
    Proper error handling instead of using `unwrap`.
    
    # Are these changes tested?
    Yes. Added 2 UTs.
    
    # Are there any user-facing changes?
    An error string that can find its way all the way to the end user.
---
 parquet/src/file/metadata/thrift/mod.rs | 90 +++++++++++++++++++++++++++++++--
 parquet/src/parquet_macros.rs           |  5 +-
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/parquet/src/file/metadata/thrift/mod.rs 
b/parquet/src/file/metadata/thrift/mod.rs
index 88cb96f355..ad8f6312c2 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -1128,8 +1128,13 @@ impl DataPageHeaderV2 {
                     repetition_levels_byte_length = Some(val);
                 }
                 7 => {
-                    let val = field_ident.bool_val.unwrap();
-                    is_compressed = Some(val);
+                    if field_ident.bool_val.is_none() {
+                        return Err(general_err!(
+                            "Expected bool field but got thrift type {:?}",
+                            field_ident.field_type
+                        ));
+                    }
+                    is_compressed = field_ident.bool_val;
                 }
                 _ => {
                     prot.skip(field_ident.field_type)?;
@@ -1721,13 +1726,17 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
 
 #[cfg(test)]
 pub(crate) mod tests {
-    use crate::basic::Type as PhysicalType;
+    use crate::basic::{Encoding, PageType, Type as PhysicalType};
     use crate::errors::Result;
-    use crate::file::metadata::thrift::{BoundingBox, SchemaElement, 
write_schema};
+    use crate::file::metadata::thrift::{
+        BoundingBox, DataPageHeaderV2, DictionaryPageHeader, PageHeader, 
SchemaElement,
+        write_schema,
+    };
     use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, 
RowGroupMetaData};
     use crate::parquet_thrift::tests::test_roundtrip;
     use crate::parquet_thrift::{
-        ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, 
read_thrift_vec,
+        ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, 
WriteThrift,
+        read_thrift_vec,
     };
     use crate::schema::types::{
         ColumnDescriptor, ColumnPath, SchemaDescriptor, TypePtr, num_nodes,
@@ -1794,6 +1803,29 @@ pub(crate) mod tests {
         read_thrift_vec(&mut prot)
     }
 
+    fn thrift_bytes<T: WriteThrift>(value: &T) -> Vec<u8> {
+        let mut buf = Vec::new();
+        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
+        value.write_thrift(&mut writer).unwrap();
+        buf
+    }
+
+    fn change_false_bool_field_to_i32(buf: &mut [u8]) {
+        let pos = buf
+            .iter()
+            .rposition(|byte| *byte == 0x12)
+            .expect("expected BOOL_FALSE field header byte");
+        buf[pos] = 0x15;
+    }
+
+    fn assert_malformed_bool_error(err: crate::errors::ParquetError) {
+        let msg = err.to_string();
+        assert!(
+            msg.contains("Expected bool field"),
+            "unexpected error message: {msg}"
+        );
+    }
+
     #[test]
     fn test_bounding_box_roundtrip() {
         test_roundtrip(BoundingBox {
@@ -1873,4 +1905,52 @@ pub(crate) mod tests {
             .unwrap();
         assert_eq!(decoded_zero.null_count_opt(), Some(0));
     }
+
+    #[test]
+    fn malformed_bool_field_returns_error_not_panic() {
+        let page_header = PageHeader {
+            r#type: PageType::DICTIONARY_PAGE,
+            uncompressed_page_size: 1,
+            compressed_page_size: 1,
+            crc: None,
+            data_page_header: None,
+            index_page_header: None,
+            dictionary_page_header: Some(DictionaryPageHeader {
+                num_values: 1,
+                encoding: Encoding::PLAIN,
+                is_sorted: Some(false),
+            }),
+            data_page_header_v2: None,
+        };
+
+        let mut buf = thrift_bytes(&page_header);
+        change_false_bool_field_to_i32(&mut buf);
+
+        let mut prot = ThriftSliceInputProtocol::new(&buf);
+        let err = PageHeader::read_thrift_without_stats(&mut prot)
+            .expect_err("malformed bool field should return an error");
+        assert_malformed_bool_error(err);
+    }
+
+    #[test]
+    fn malformed_data_page_v2_bool_field_returns_error_not_panic() {
+        let data_page_header_v2 = DataPageHeaderV2 {
+            num_values: 1,
+            num_nulls: 0,
+            num_rows: 1,
+            encoding: Encoding::PLAIN,
+            definition_levels_byte_length: 0,
+            repetition_levels_byte_length: 0,
+            is_compressed: Some(false),
+            statistics: None,
+        };
+
+        let mut buf = thrift_bytes(&data_page_header_v2);
+        change_false_bool_field_to_i32(&mut buf);
+
+        let mut prot = ThriftSliceInputProtocol::new(&buf);
+        let err = DataPageHeaderV2::read_thrift_without_stats(&mut prot)
+            .expect_err("malformed bool field should return an error");
+        assert_malformed_bool_error(err);
+    }
 }
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index 714015e10e..13c4763431 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -464,7 +464,10 @@ macro_rules! __thrift_read_field {
         $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)?
     };
     ($prot:tt, $field_ident:tt, bool) => {
-        $field_ident.bool_val.unwrap()
+        $field_ident.bool_val.ok_or_else(|| general_err!(
+            "Expected bool field but got thrift type {:?}",
+            $field_ident.field_type
+        ))?
     };
     ($prot:tt, $field_ident:tt, $field_type:ident) => {
         $field_type::read_thrift(&mut *$prot)?

Reply via email to