This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1377761779 Validate FIXED_LEN_BYTE_ARRAY length for DECIMAL and
INTERVAL types (#9985)
1377761779 is described below
commit 1377761779afb1ce70a7a9a9038a308d2ff1ab88
Author: eunsang <[email protected]>
AuthorDate: Fri May 29 03:05:56 2026 +0900
Validate FIXED_LEN_BYTE_ARRAY length for DECIMAL and INTERVAL types (#9985)
## Which issue does this PR close?
- Closes #9984.
## Rationale for this change
`from_fixed_len_byte_array` in `parquet/src/arrow/schema/primitive.rs`
does not validate `type_length`. While `PrimitiveTypeBuilder::build()`
enforces these constraints during schema construction
(`schema/types.rs:477` for INTERVAL, `:565-580` for DECIMAL), schemas
decoded directly from Thrift bypass that validation path entirely. As a
result:
- `DECIMAL` with a `type_length` outside `1..=32` was silently routed
through `Decimal128` / `Decimal256` using invalid parameters.
- `INTERVAL` with a `type_length != 12` silently returned
`Interval(DayTime)` regardless.
The same function already rejects `FLOAT16` when `type_length != 2`.
This PR mirrors that pattern for DECIMAL and INTERVAL, closing the TODO
introduced in #1682.
## What changes are included in this PR?
- Added a `check_decimal_length` helper to reject `type_length` values
outside `1..=32` for both `LogicalType::Decimal` and
`ConvertedType::DECIMAL`.
- Added an inline `type_length == 12` check for
`ConvertedType::INTERVAL`.
## Are these changes tested?
Yes. Added five new tests in
`parquet/src/arrow/schema/primitive.rs::tests` covering:
- Invalid lengths (`{-1, 0, 33}` for DECIMAL, `{0, 11, 13}` for
INTERVAL)
- Valid lengths (16 → `Decimal128`, 32 → `Decimal256`, 12 →
`Interval(DayTime)`)
To exercise the reader-side check, the tests construct a valid
`Type::PrimitiveType` via the builder and directly modify the
`type_length` on the resulting enum, simulating a malformed schema
decoded from Thrift.
## Are there any user-facing changes?
No public API changes. The only behavior change is on the reader side:
schemas with an out-of-range `type_length` for DECIMAL or INTERVAL will
now return a `ParquetError::General` instead of silently producing a
mismatched Arrow type.
---
parquet/src/arrow/schema/primitive.rs | 132 +++++++++++++++++++++++++++++++++-
1 file changed, 131 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/schema/primitive.rs
b/parquet/src/arrow/schema/primitive.rs
index 2272014a93..c70885355d 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -170,6 +170,16 @@ fn decimal_256_type(scale: i32, precision: i32) ->
Result<DataType> {
Ok(DataType::Decimal256(precision, scale))
}
+#[allow(clippy::manual_range_contains)]
+fn check_decimal_length(type_length: i32) -> Result<()> {
+ if type_length < 1 || type_length > 32 {
+ return Err(ParquetError::General(format!(
+ "DECIMAL must be a Fixed Length Byte Array with length 1 to 32,
got {type_length}"
+ )));
+ }
+ Ok(())
+}
+
fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) ->
Result<DataType> {
match (info.logical_type_ref(), info.converted_type()) {
(None, ConvertedType::NONE) => Ok(DataType::Int32),
@@ -293,9 +303,10 @@ fn from_fixed_len_byte_array(
precision: i32,
type_length: i32,
) -> Result<DataType> {
- // TODO: This should check the type length for the decimal and interval
types
match (info.logical_type_ref(), info.converted_type()) {
(Some(LogicalType::Decimal(decimal)), _) => {
+ check_decimal_length(type_length)?;
+ // lengths 1..=16 map to Decimal128, 17..=32 to Decimal256
if type_length <= 16 {
decimal_128_type(decimal.scale, decimal.precision)
} else {
@@ -303,6 +314,7 @@ fn from_fixed_len_byte_array(
}
}
(None, ConvertedType::DECIMAL) => {
+ check_decimal_length(type_length)?;
if type_length <= 16 {
decimal_128_type(scale, precision)
} else {
@@ -310,6 +322,11 @@ fn from_fixed_len_byte_array(
}
}
(None, ConvertedType::INTERVAL) => {
+ if type_length != 12 {
+ return Err(ParquetError::General(format!(
+ "INTERVAL must be a Fixed Length Byte Array with length
12, got {type_length}"
+ )));
+ }
// There is currently no reliable way of determining which
IntervalUnit
// to return. Thus without the original Arrow schema, the results
// would be incorrect if all 12 bytes of the interval are populated
@@ -328,3 +345,116 @@ fn from_fixed_len_byte_array(
_ => Ok(DataType::FixedSizeBinary(type_length)),
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::basic::{DecimalType, Repetition};
+ use crate::schema::types::Type;
+
+ // The PrimitiveTypeBuilder rejects bad lengths at construction. To
exercise
+ // the reader-side checks, build a valid type then overwrite its
type_length,
+ // simulating a schema decoded from a file that wasn't produced via the
builder.
+ fn with_type_length(ty: Type, type_length: i32) -> Type {
+ match ty {
+ Type::PrimitiveType {
+ basic_info,
+ physical_type,
+ precision,
+ scale,
+ ..
+ } => Type::PrimitiveType {
+ basic_info,
+ physical_type,
+ type_length,
+ precision,
+ scale,
+ },
+ _ => unreachable!(),
+ }
+ }
+
+ fn flba_decimal_logical(type_length: i32) -> Type {
+ let valid = Type::primitive_type_builder("c",
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+ .with_repetition(Repetition::REQUIRED)
+ .with_logical_type(Some(LogicalType::Decimal(DecimalType {
+ precision: 5,
+ scale: 2,
+ })))
+ .with_length(16)
+ .with_precision(5)
+ .with_scale(2)
+ .build()
+ .unwrap();
+ with_type_length(valid, type_length)
+ }
+
+ fn flba_decimal_converted(type_length: i32) -> Type {
+ let valid = Type::primitive_type_builder("c",
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+ .with_repetition(Repetition::REQUIRED)
+ .with_converted_type(ConvertedType::DECIMAL)
+ .with_length(16)
+ .with_precision(5)
+ .with_scale(2)
+ .build()
+ .unwrap();
+ with_type_length(valid, type_length)
+ }
+
+ fn flba_interval(type_length: i32) -> Type {
+ let valid = Type::primitive_type_builder("c",
PhysicalType::FIXED_LEN_BYTE_ARRAY)
+ .with_repetition(Repetition::REQUIRED)
+ .with_converted_type(ConvertedType::INTERVAL)
+ .with_length(12)
+ .build()
+ .unwrap();
+ with_type_length(valid, type_length)
+ }
+
+ fn assert_err_contains(ty: &Type, needle: &str) {
+ let err = convert_primitive(ty, None).expect_err("expected an error");
+ let msg = err.to_string();
+ assert!(msg.contains(needle), "expected {needle:?} in error: {msg}");
+ }
+
+ #[test]
+ fn decimal_logical_rejects_invalid_length() {
+ for bad in [-1, 0, 33] {
+ assert_err_contains(&flba_decimal_logical(bad), "DECIMAL");
+ }
+ }
+
+ #[test]
+ fn decimal_converted_rejects_invalid_length() {
+ for bad in [-1, 0, 33] {
+ assert_err_contains(&flba_decimal_converted(bad), "DECIMAL");
+ }
+ }
+
+ #[test]
+ fn decimal_accepts_valid_lengths() {
+ assert!(matches!(
+ convert_primitive(&flba_decimal_logical(16), None).unwrap(),
+ DataType::Decimal128(_, _)
+ ));
+ assert!(matches!(
+ convert_primitive(&flba_decimal_logical(32), None).unwrap(),
+ DataType::Decimal256(_, _)
+ ));
+ }
+
+ #[test]
+ fn interval_rejects_wrong_length() {
+ for bad in [0, 11, 13] {
+ assert_err_contains(&flba_interval(bad), "INTERVAL");
+ }
+ }
+
+ #[test]
+ fn interval_accepts_length_12() {
+ assert_eq!(
+ convert_primitive(&flba_interval(12), None).unwrap(),
+ DataType::Interval(IntervalUnit::DayTime)
+ );
+ }
+}