This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 6ce4bc8996 Validate encoded Thrift lists match the schema (#9924)
6ce4bc8996 is described below

commit 6ce4bc899644c1b0816072f3260d835a0c14d148
Author: Ed Seidl <[email protected]>
AuthorDate: Thu May 7 11:24:08 2026 -0700

    Validate encoded Thrift lists match the schema (#9924)
    
    # Which issue does this PR close?
    
    - Part of #9923.
    
    # Rationale for this change
    
    A first attempt at adding some validation. This will check that the
    encoded list element type matches what is expected from the Parquet
    schema.
    
    # What changes are included in this PR?
    Adds an `ELEMENT_TYPE` to the `ReadThrift` trait for use in validating
    data types in `read_thrift_vec`.
    
    # Are these changes tested?
    
    Should be covered by existing. These changes also cause an earlier error
    detection in an existing test of malformed data.
    
    # Are there any user-facing changes?
    
    No, just improves error handling
---
 parquet/src/basic.rs                        |  4 +++-
 parquet/src/file/metadata/thrift/mod.rs     |  9 ++++++++-
 parquet/src/file/page_index/offset_index.rs |  3 ++-
 parquet/src/file/serialized_reader.rs       |  2 +-
 parquet/src/parquet_thrift.rs               | 30 ++++++++++++++++++++++++++++-
 parquet/tests/arrow_reader/bad_data.rs      |  2 +-
 6 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index cc7a16eca0..17e5b9c321 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -28,7 +28,7 @@ pub use crate::compression::{BrotliLevel, GzipLevel, 
ZstdLevel};
 use crate::file::metadata::HeapSize;
 use crate::parquet_thrift::{
     ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, 
ThriftCompactOutputProtocol,
-    WriteThrift, WriteThriftField,
+    WriteThrift, WriteThriftField, validate_list_type,
 };
 use crate::{thrift_enum, thrift_struct, thrift_union_all_empty, 
write_thrift_field};
 
@@ -771,6 +771,8 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, 
R> for EncodingMask {
 
         // This reads a Thrift `list<Encoding>` and turns it into a bitmask
         let list_ident = prot.read_list_begin()?;
+        // check for enum (encoded as I32)
+        validate_list_type(ElementType::I32, &list_ident)?;
         for _ in 0..list_ident.size {
             let val = Encoding::read_thrift(prot)?;
             mask |= 1 << val as i32;
diff --git a/parquet/src/file/metadata/thrift/mod.rs 
b/parquet/src/file/metadata/thrift/mod.rs
index b68af0c485..75fcad871e 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -51,7 +51,7 @@ use crate::{
     parquet_thrift::{
         ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
         ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift, 
WriteThriftField,
-        read_thrift_vec,
+        read_thrift_vec, validate_list_type,
     },
     schema::types::{
         ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes, 
parquet_schema_from_array,
@@ -387,6 +387,9 @@ fn read_encoding_stats_as_mask<'a>(
     // read the vector of stats, setting mask bits for data pages
     let mut mask = 0i32;
     let list_ident = prot.read_list_begin()?;
+    // check for PageEncodingStats struct
+    validate_list_type(ElementType::Struct, &list_ident)?;
+
     for _ in 0..list_ident.size {
         let pes = PageEncodingStats::read_thrift(prot)?;
         match pes.page_type {
@@ -652,6 +655,8 @@ fn read_row_group(
         match field_ident.id {
             1 => {
                 let list_ident = prot.read_list_begin()?;
+                // check for list of struct
+                validate_list_type(ElementType::Struct, &list_ident)?;
                 if schema_descr.num_columns() != list_ident.size as usize {
                     return Err(general_err!(
                         "Column count mismatch. Schema has {} columns while 
Row Group has {}",
@@ -801,6 +806,8 @@ pub(crate) fn parquet_metadata_from_bytes(
                 }
                 let schema_descr = schema_descr.as_ref().unwrap();
                 let list_ident = prot.read_list_begin()?;
+                // check for list of struct
+                validate_list_type(ElementType::Struct, &list_ident)?;
                 let mut rg_vec = Vec::with_capacity(list_ident.size as usize);
 
                 // Read row groups and handle ordinal assignment
diff --git a/parquet/src/file/page_index/offset_index.rs 
b/parquet/src/file/page_index/offset_index.rs
index b1e30dd459..06d21efb68 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -23,7 +23,7 @@ use std::io::Write;
 
 use crate::parquet_thrift::{
     ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, 
ThriftCompactOutputProtocol,
-    WriteThrift, WriteThriftField, read_thrift_vec,
+    WriteThrift, WriteThriftField, read_thrift_vec, validate_list_type,
 };
 use crate::{
     errors::{ParquetError, Result},
@@ -90,6 +90,7 @@ impl OffsetIndexMetaData {
 
         // we have to do this manually because we want to use the fast 
PageLocation decoder
         let list_ident = prot.read_list_begin()?;
+        validate_list_type(ElementType::Struct, &list_ident)?;
         let mut page_locations = Vec::with_capacity(list_ident.size as usize);
         for _ in 0..list_ident.size {
             page_locations.push(read_page_location(prot)?);
diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
index 254ccb779a..4b71e3c14e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2117,7 +2117,7 @@ mod tests {
         let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data));
         assert_eq!(
             ret.err().unwrap().to_string(),
-            "Parquet error: Received empty union from remote ColumnOrder"
+            "Parquet error: Expected list element type of Struct but got List"
         );
     }
 
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index e621f4c498..b9d69b9d52 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -702,9 +702,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, 
R> for &'a [u8] {
 pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result<Vec<T>>
 where
     R: ThriftCompactInputProtocol<'a>,
-    T: ReadThrift<'a, R>,
+    T: ReadThrift<'a, R> + WriteThrift,
 {
     let list_ident = prot.read_list_begin()?;
+    validate_list_type(T::ELEMENT_TYPE, &list_ident)?;
     let mut res = Vec::with_capacity(list_ident.size as usize);
     for _ in 0..list_ident.size {
         let val = T::read_thrift(prot)?;
@@ -713,6 +714,17 @@ where
     Ok(res)
 }
 
+pub(crate) fn validate_list_type(expected: ElementType, got: &ListIdentifier) 
-> Result<()> {
+    if got.element_type != expected {
+        return Err(general_err!(
+            "Expected list element type of {:?} but got {:?}",
+            expected,
+            got.element_type
+        ));
+    }
+    Ok(())
+}
+
 /////////////////////////
 // thrift compact output
 
@@ -1155,4 +1167,20 @@ pub(crate) mod tests {
         let result = prot.read_list_begin();
         assert!(result.is_err(), "expected error, got {result:?}");
     }
+
+    #[test]
+    fn test_read_list_wrong_type() {
+        // list header: 4 elements of `Boolean`
+        let data = [0x42, 0x01];
+        let mut prot = ThriftSliceInputProtocol::new(&data);
+        // try to read as list<i32>
+        let result = read_thrift_vec::<i32, ThriftSliceInputProtocol>(&mut 
prot);
+        println!("{result:?}");
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Expected list element type of I32 but got Bool")
+        );
+    }
 }
diff --git a/parquet/tests/arrow_reader/bad_data.rs 
b/parquet/tests/arrow_reader/bad_data.rs
index d9b0d89e2c..56fddf505d 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -98,7 +98,7 @@ fn test_arrow_gh_41317() {
     let err = read_file("ARROW-GH-41317.parquet").unwrap_err();
     assert_eq!(
         err.to_string(),
-        "External: Parquet argument error: Parquet error: StructArrayReader 
out of sync in read_records, expected 5 read, got 2"
+        "Parquet error: Expected list element type of I32 but got I16"
     );
 }
 

Reply via email to