This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 30185d6116 perf: Remove `bool_val` from Parquet Thrift
`FieldIdentifier` (#9945)
30185d6116 is described below
commit 30185d6116c7ab6c50aa19d9c32defcf2937f720
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 15 07:24:13 2026 -0700
perf: Remove `bool_val` from Parquet Thrift `FieldIdentifier` (#9945)
# Which issue does this PR close?
- Closes #9946.
# Rationale for this change
Remove some unnecessary branching from a hot path in the Thrift parser,
improving performance and making for easier to read code.
# What changes are included in this PR?
Removes the `bool_val` field from `FieldIdentifier`.
# Are these changes tested?
Covered by existing tests
# Are there any user-facing changes?
No, this is an internal-only API
---
parquet/src/file/metadata/thrift/mod.rs | 10 +----
parquet/src/parquet_macros.rs | 5 +--
parquet/src/parquet_thrift.rs | 73 +++++++++++++++------------------
3 files changed, 36 insertions(+), 52 deletions(-)
diff --git a/parquet/src/file/metadata/thrift/mod.rs
b/parquet/src/file/metadata/thrift/mod.rs
index 75fcad871e..9be697c0fe 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -1135,13 +1135,7 @@ impl DataPageHeaderV2 {
repetition_levels_byte_length = Some(val);
}
7 => {
- if field_ident.bool_val.is_none() {
- return Err(general_err!(
- "Expected bool field but got thrift type {:?}",
- field_ident.field_type
- ));
- }
- is_compressed = field_ident.bool_val;
+ is_compressed = Some(field_ident.bool_val()?);
}
_ => {
prot.skip(field_ident.field_type)?;
@@ -1837,7 +1831,7 @@ pub(crate) mod tests {
fn assert_malformed_bool_error(err: crate::errors::ParquetError) {
let msg = err.to_string();
assert!(
- msg.contains("Expected bool field"),
+ msg.contains("Unexpected struct field type"),
"unexpected error message: {msg}"
);
}
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index 13c4763431..8bb2ad23b0 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -464,10 +464,7 @@ macro_rules! __thrift_read_field {
$crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)?
};
($prot:tt, $field_ident:tt, bool) => {
- $field_ident.bool_val.ok_or_else(|| general_err!(
- "Expected bool field but got thrift type {:?}",
- $field_ident.field_type
- ))?
+ $field_ident.bool_val()?
};
($prot:tt, $field_ident:tt, $field_type:ident) => {
$field_type::read_thrift(&mut *$prot)?
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index b9d69b9d52..7567ace08b 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -59,9 +59,10 @@ impl From<ThriftProtocolError> for ParquetError {
match e {
ThriftProtocolError::Eof => eof_err!("Unexpected EOF"),
ThriftProtocolError::IO(e) => e.into(),
- ThriftProtocolError::InvalidFieldType(value) => {
- general_err!("Unexpected struct field type {}", value)
- }
+ ThriftProtocolError::InvalidFieldType(value) => match
FieldType::try_from(value) {
+ Ok(fld_type) => general_err!("Unexpected struct field type
{:?}", fld_type),
+ Err(_) => general_err!("Unexpected struct field type {}",
value),
+ },
ThriftProtocolError::InvalidElementType(value) => {
general_err!("Unexpected list/set element type {}", value)
}
@@ -247,11 +248,16 @@ pub(crate) struct FieldIdentifier {
pub(crate) field_type: FieldType,
/// The field's `id`. May be computed from delta or directly decoded.
pub(crate) id: i16,
- /// Stores the value for booleans.
- ///
- /// Boolean fields store no data, instead the field type is either boolean
true, or
- /// boolean false.
- pub(crate) bool_val: Option<bool>,
+}
+
+impl FieldIdentifier {
+ pub(crate) fn bool_val(&self) -> ThriftProtocolResult<bool> {
+ match self.field_type {
+ FieldType::BooleanTrue => Ok(true),
+ FieldType::BooleanFalse => Ok(false),
+ _ => Err(ThriftProtocolError::InvalidFieldType(self.field_type as
u8)),
+ }
+ }
}
/// Struct used to describe a [thrift list].
@@ -358,41 +364,28 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
// - the type
// - the field delta and the type
let field_type = self.read_byte()?;
- let field_delta = (field_type & 0xf0) >> 4;
- let field_type = FieldType::try_from(field_type & 0xf)?;
- let mut bool_val: Option<bool> = None;
-
- match field_type {
- FieldType::Stop => Ok(FieldIdentifier {
+ if field_type & 0xf == 0 {
+ return Ok(FieldIdentifier {
field_type: FieldType::Stop,
id: 0,
- bool_val,
- }),
- _ => {
- // special handling for bools
- if field_type == FieldType::BooleanFalse {
- bool_val = Some(false);
- } else if field_type == FieldType::BooleanTrue {
- bool_val = Some(true);
- }
- let field_id = if field_delta != 0 {
- last_field_id.checked_add(field_delta as i16).ok_or(
- ThriftProtocolError::FieldDeltaOverflow {
- field_delta,
- last_field_id,
- },
- )?
- } else {
- self.read_full_field_id()?
- };
-
- Ok(FieldIdentifier {
- field_type,
- id: field_id,
- bool_val,
- })
- }
+ });
}
+
+ let field_delta = (field_type & 0xf0) >> 4;
+ let field_type = FieldType::try_from(field_type & 0xf)?;
+
+ let id = if field_delta != 0 {
+ last_field_id.checked_add(field_delta as i16).ok_or(
+ ThriftProtocolError::FieldDeltaOverflow {
+ field_delta,
+ last_field_id,
+ },
+ )?
+ } else {
+ self.read_full_field_id()?
+ };
+
+ Ok(FieldIdentifier { field_type, id })
}
/// This is a specialized version of [`Self::read_field_begin`], solely
for use in parsing