This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new fd1c5b391e Safely ignore Parquet fields with unimplemented Thrift
types (#9974)
fd1c5b391e is described below
commit fd1c5b391e169762a0981870c4e94baa3372d7a3
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 15 08:19:21 2026 -0700
Safely ignore Parquet fields with unimplemented Thrift types (#9974)
# Which issue does this PR close?
- Closes #9973.
# Rationale for this change
The thrift decoder should be able to skip fields with unimplemented
thrift types `set`, `map`, and `uuid`.
# What changes are included in this PR?
Flesh out the thrift enums and add code to the skip function to handle
the above types.
# Are these changes tested?
Yes, test is added
# Are there any user-facing changes?
No, changes are to internal APIs
---
parquet/src/parquet_thrift.rs | 60 +++++++++++++++++++++----------
parquet/tests/arrow_reader/bad_data.rs | 11 ++++++
parquet/tests/arrow_reader/new_types.bin | Bin 0 -> 56 bytes
3 files changed, 52 insertions(+), 19 deletions(-)
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index 7567ace08b..cc6390b392 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -157,6 +157,7 @@ pub(crate) enum FieldType {
Set = 10,
Map = 11,
Struct = 12,
+ Uuid = 13,
}
impl TryFrom<u8> for FieldType {
@@ -176,25 +177,27 @@ impl TryFrom<u8> for FieldType {
10 => Ok(Self::Set),
11 => Ok(Self::Map),
12 => Ok(Self::Struct),
+ 13 => Ok(Self::Uuid),
_ => Err(ThriftProtocolError::InvalidFieldType(value)),
}
}
}
-impl TryFrom<ElementType> for FieldType {
- type Error = ThriftProtocolError;
- fn try_from(value: ElementType) -> std::result::Result<Self, Self::Error> {
+impl From<ElementType> for FieldType {
+ fn from(value: ElementType) -> Self {
match value {
- ElementType::Bool => Ok(Self::BooleanTrue),
- ElementType::Byte => Ok(Self::Byte),
- ElementType::I16 => Ok(Self::I16),
- ElementType::I32 => Ok(Self::I32),
- ElementType::I64 => Ok(Self::I64),
- ElementType::Double => Ok(Self::Double),
- ElementType::Binary => Ok(Self::Binary),
- ElementType::List => Ok(Self::List),
- ElementType::Struct => Ok(Self::Struct),
- _ => Err(ThriftProtocolError::InvalidFieldType(value as u8)),
+ ElementType::Bool => Self::BooleanTrue,
+ ElementType::Byte => Self::Byte,
+ ElementType::I16 => Self::I16,
+ ElementType::I32 => Self::I32,
+ ElementType::I64 => Self::I64,
+ ElementType::Double => Self::Double,
+ ElementType::Binary => Self::Binary,
+ ElementType::List => Self::List,
+ ElementType::Set => Self::Set,
+ ElementType::Map => Self::Map,
+ ElementType::Struct => Self::Struct,
+ ElementType::Uuid => Self::Uuid,
}
}
}
@@ -213,6 +216,7 @@ pub(crate) enum ElementType {
Set = 10,
Map = 11,
Struct = 12,
+ Uuid = 13,
}
impl TryFrom<u8> for ElementType {
@@ -235,6 +239,7 @@ impl TryFrom<u8> for ElementType {
10 => Ok(Self::Set),
11 => Ok(Self::Map),
12 => Ok(Self::Struct),
+ 13 => Ok(Self::Uuid),
_ => Err(ThriftProtocolError::InvalidElementType(value)),
}
}
@@ -499,27 +504,44 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
FieldType::I64 => self.skip_vlq().map(|_| ()),
FieldType::Double => self.skip_bytes(8).map(|_| ()),
FieldType::Binary => self.skip_binary().map(|_| ()),
+ // see
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct
FieldType::Struct => {
- let mut last_field_id = 0i16;
loop {
- let field_ident = self.read_field_begin(last_field_id)?;
+ // we don't need field id for skipping, so always pass 0
for last id
+ let field_ident = self.read_field_begin(0)?;
if field_ident.field_type == FieldType::Stop {
break;
}
self.skip_till_depth(field_ident.field_type, depth - 1)?;
- last_field_id = field_ident.id;
}
Ok(())
}
- FieldType::List => {
+ // lists and sets are encoded the same
+ // see
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
+ FieldType::List | FieldType::Set => {
let list_ident = self.read_list_begin()?;
+ let element_type = FieldType::from(list_ident.element_type);
for _ in 0..list_ident.size {
- let element_type =
FieldType::try_from(list_ident.element_type)?;
self.skip_till_depth(element_type, depth - 1)?;
}
Ok(())
}
- // no list or map types in parquet format
+ // see
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#map
+ FieldType::Map => {
+ let size = i32::try_from(self.read_vlq()?)?;
+ if size > 0 {
+ let kv = self.read_byte()?;
+ let key_type = FieldType::from(ElementType::try_from(kv >>
4)?);
+ let val_type = FieldType::from(ElementType::try_from(kv &
0xf)?);
+ for _ in 0..size {
+ self.skip_till_depth(key_type, depth - 1)?;
+ self.skip_till_depth(val_type, depth - 1)?;
+ }
+ }
+ Ok(())
+ }
+ // see
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#universal-unique-identifier-encoding
+ FieldType::Uuid => self.skip_bytes(16).map(|_| ()),
_ => Err(ThriftProtocolError::SkipUnsupportedType(field_type)),
}
}
diff --git a/parquet/tests/arrow_reader/bad_data.rs
b/parquet/tests/arrow_reader/bad_data.rs
index 56fddf505d..38fa69cdb1 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -21,6 +21,7 @@ use arrow::util::test_util::parquet_test_data;
use bytes::Bytes;
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
use parquet::errors::ParquetError;
+use parquet::file::metadata::ParquetMetaDataReader;
use std::collections::HashSet;
use std::path::PathBuf;
@@ -175,6 +176,16 @@ fn non_standard_delta_blocks() {
}
}
+#[test]
+fn skip_unknown_types() {
+ // test file contains a FileMetaData with unknown fields with
+ // types not currently used by Parquet (uuid, set, map). The
+ // parser should be able to skip these unknown fields without
+ // erroring.
+ let file = Bytes::from_static(include_bytes!("new_types.bin"));
+ ParquetMetaDataReader::decode_metadata(&file).unwrap();
+}
+
#[cfg(feature = "async")]
#[tokio::test]
#[allow(deprecated)]
diff --git a/parquet/tests/arrow_reader/new_types.bin
b/parquet/tests/arrow_reader/new_types.bin
new file mode 100644
index 0000000000..00d86fd630
Binary files /dev/null and b/parquet/tests/arrow_reader/new_types.bin differ