This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new fe3c0c92bc fix(parquet): Avoid panic on malformed thrift bool fields
in parquet metadata (#9840)
fe3c0c92bc is described below
commit fe3c0c92bc4e3617cd82b74ca01376926ee98880
Author: BoazC-MSFT <[email protected]>
AuthorDate: Wed Apr 29 02:54:18 2026 +0300
fix(parquet): Avoid panic on malformed thrift bool fields in parquet
metadata (#9840)
Return a Parquet error when a compact thrift field is expected to be a
`bool` but the field header contains a non-bool wire type. This replaces
the remaining `bool_val` unwraps in generated thrift readers and the
hand-expanded `DataPageHeaderV2` reader.
Add regression tests for malformed `DictionaryPageHeader.is_sorted` and
`DataPageHeaderV2.is_compressed` headers to ensure corrupt input returns
an error instead of panicking.
# Which issue does this PR close?
- Closes #9839.
# Rationale for this change
See #9839
# What changes are included in this PR?
Proper error handling instead of using `unwrap`.
# Are these changes tested?
Yes. Added 2 UTs.
# Are there any user-facing changes?
An error string that can find its way all the way to the end user.
---
parquet/src/file/metadata/thrift/mod.rs | 90 +++++++++++++++++++++++++++++++--
parquet/src/parquet_macros.rs | 5 +-
2 files changed, 89 insertions(+), 6 deletions(-)
diff --git a/parquet/src/file/metadata/thrift/mod.rs
b/parquet/src/file/metadata/thrift/mod.rs
index 88cb96f355..ad8f6312c2 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -1128,8 +1128,13 @@ impl DataPageHeaderV2 {
repetition_levels_byte_length = Some(val);
}
7 => {
- let val = field_ident.bool_val.unwrap();
- is_compressed = Some(val);
+ if field_ident.bool_val.is_none() {
+ return Err(general_err!(
+ "Expected bool field but got thrift type {:?}",
+ field_ident.field_type
+ ));
+ }
+ is_compressed = field_ident.bool_val;
}
_ => {
prot.skip(field_ident.field_type)?;
@@ -1721,13 +1726,17 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
#[cfg(test)]
pub(crate) mod tests {
- use crate::basic::Type as PhysicalType;
+ use crate::basic::{Encoding, PageType, Type as PhysicalType};
use crate::errors::Result;
- use crate::file::metadata::thrift::{BoundingBox, SchemaElement,
write_schema};
+ use crate::file::metadata::thrift::{
+ BoundingBox, DataPageHeaderV2, DictionaryPageHeader, PageHeader,
SchemaElement,
+ write_schema,
+ };
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions,
RowGroupMetaData};
use crate::parquet_thrift::tests::test_roundtrip;
use crate::parquet_thrift::{
- ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol,
read_thrift_vec,
+ ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol,
WriteThrift,
+ read_thrift_vec,
};
use crate::schema::types::{
ColumnDescriptor, ColumnPath, SchemaDescriptor, TypePtr, num_nodes,
@@ -1794,6 +1803,29 @@ pub(crate) mod tests {
read_thrift_vec(&mut prot)
}
+ fn thrift_bytes<T: WriteThrift>(value: &T) -> Vec<u8> {
+ let mut buf = Vec::new();
+ let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
+ value.write_thrift(&mut writer).unwrap();
+ buf
+ }
+
+ fn change_false_bool_field_to_i32(buf: &mut [u8]) {
+ let pos = buf
+ .iter()
+ .rposition(|byte| *byte == 0x12)
+ .expect("expected BOOL_FALSE field header byte");
+ buf[pos] = 0x15;
+ }
+
+ fn assert_malformed_bool_error(err: crate::errors::ParquetError) {
+ let msg = err.to_string();
+ assert!(
+ msg.contains("Expected bool field"),
+ "unexpected error message: {msg}"
+ );
+ }
+
#[test]
fn test_bounding_box_roundtrip() {
test_roundtrip(BoundingBox {
@@ -1873,4 +1905,52 @@ pub(crate) mod tests {
.unwrap();
assert_eq!(decoded_zero.null_count_opt(), Some(0));
}
+
+ #[test]
+ fn malformed_bool_field_returns_error_not_panic() {
+ let page_header = PageHeader {
+ r#type: PageType::DICTIONARY_PAGE,
+ uncompressed_page_size: 1,
+ compressed_page_size: 1,
+ crc: None,
+ data_page_header: None,
+ index_page_header: None,
+ dictionary_page_header: Some(DictionaryPageHeader {
+ num_values: 1,
+ encoding: Encoding::PLAIN,
+ is_sorted: Some(false),
+ }),
+ data_page_header_v2: None,
+ };
+
+ let mut buf = thrift_bytes(&page_header);
+ change_false_bool_field_to_i32(&mut buf);
+
+ let mut prot = ThriftSliceInputProtocol::new(&buf);
+ let err = PageHeader::read_thrift_without_stats(&mut prot)
+ .expect_err("malformed bool field should return an error");
+ assert_malformed_bool_error(err);
+ }
+
+ #[test]
+ fn malformed_data_page_v2_bool_field_returns_error_not_panic() {
+ let data_page_header_v2 = DataPageHeaderV2 {
+ num_values: 1,
+ num_nulls: 0,
+ num_rows: 1,
+ encoding: Encoding::PLAIN,
+ definition_levels_byte_length: 0,
+ repetition_levels_byte_length: 0,
+ is_compressed: Some(false),
+ statistics: None,
+ };
+
+ let mut buf = thrift_bytes(&data_page_header_v2);
+ change_false_bool_field_to_i32(&mut buf);
+
+ let mut prot = ThriftSliceInputProtocol::new(&buf);
+ let err = DataPageHeaderV2::read_thrift_without_stats(&mut prot)
+ .expect_err("malformed bool field should return an error");
+ assert_malformed_bool_error(err);
+ }
}
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index 714015e10e..13c4763431 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -464,7 +464,10 @@ macro_rules! __thrift_read_field {
$crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)?
};
($prot:tt, $field_ident:tt, bool) => {
- $field_ident.bool_val.unwrap()
+ $field_ident.bool_val.ok_or_else(|| general_err!(
+ "Expected bool field but got thrift type {:?}",
+ $field_ident.field_type
+ ))?
};
($prot:tt, $field_ident:tt, $field_type:ident) => {
$field_type::read_thrift(&mut *$prot)?