This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1ddb4d9910 [PARQUET] Allow `UNKNOWN` logical type annotation on any
physical type (#9855)
1ddb4d9910 is described below
commit 1ddb4d99108590516cdaaaf37074cc8c51264320
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 1 12:11:51 2026 -0700
[PARQUET] Allow `UNKNOWN` logical type annotation on any physical type
(#9855)
# Which issue does this PR close?
- Closes #9844.
# Rationale for this change
Parquet writers for the most part only annotate `INT32` columns with the
`UNKNOWN` logical type annotation. This annotation denotes a column that
contains only null values, and whose actual physical type therefore
cannot be deduced. This crate assumes that _only_ `INT32` columns can be
so annotated, but there is no such requirement in the Parquet
[specification](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null).
# What changes are included in this PR?
Modifies Parquet schema parsing to allow `UNKNOWN` on any physical type,
and ensures that when the annotation is encountered, an Arrow array
reader for the `Null` data type is used.
# Are these changes tested?
Yes, a unit test is added.
# Are there any user-facing changes?
No API changes, just a behavior change.
---
parquet/src/arrow/array_reader/builder.rs | 36 ++++++++-------
parquet/src/arrow/arrow_reader/mod.rs | 77 +++++++++++++++++++++++++++++--
parquet/src/arrow/schema/primitive.rs | 29 ++++++------
parquet/src/schema/types.rs | 2 +-
4 files changed, 111 insertions(+), 33 deletions(-)
diff --git a/parquet/src/arrow/array_reader/builder.rs
b/parquet/src/arrow/array_reader/builder.rs
index d806b2147a..0c575bf2e6 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -422,6 +422,20 @@ impl<'a> ArrayReaderBuilder<'a> {
let page_iterator = self.row_groups.column_chunks(col_idx)?;
let arrow_type = Some(field.arrow_type.clone());
+ // LogicalType::Unknown maps to DataType::Null. In the past it has
been assumed
+ // that only INT32 can have this annotation, but this is not required
by the Parquet
+ // specification. Since this can only annotate an entirely null
column, the data type
+ // used for the NullArrayReader should be irrelevant. It's just needed
to read the
+ // repetition and definition level data.
+ if matches!(arrow_type, Some(DataType::Null)) {
+ let reader = Box::new(NullArrayReader::<Int32Type>::new(
+ page_iterator,
+ column_desc,
+ self.batch_size,
+ )?) as _;
+ return Ok(Some(reader));
+ }
+
let reader = match physical_type {
PhysicalType::BOOLEAN =>
Box::new(PrimitiveArrayReader::<BoolType>::new(
page_iterator,
@@ -429,22 +443,12 @@ impl<'a> ArrayReaderBuilder<'a> {
arrow_type,
self.batch_size,
)?) as _,
- PhysicalType::INT32 => {
- if let Some(DataType::Null) = arrow_type {
- Box::new(NullArrayReader::<Int32Type>::new(
- page_iterator,
- column_desc,
- self.batch_size,
- )?) as _
- } else {
- Box::new(PrimitiveArrayReader::<Int32Type>::new(
- page_iterator,
- column_desc,
- arrow_type,
- self.batch_size,
- )?) as _
- }
- }
+ PhysicalType::INT32 =>
Box::new(PrimitiveArrayReader::<Int32Type>::new(
+ page_iterator,
+ column_desc,
+ arrow_type,
+ self.batch_size,
+ )?) as _,
PhysicalType::INT64 =>
Box::new(PrimitiveArrayReader::<Int64Type>::new(
page_iterator,
column_desc,
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index f9ef4eac65..70d3ce7cf9 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1600,13 +1600,13 @@ pub(crate) mod tests {
use crate::basic::{ConvertedType, Encoding, LogicalType, Repetition, Type
as PhysicalType};
use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
use crate::data_type::{
- BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray,
FixedLenByteArrayType,
- FloatType, Int32Type, Int64Type, Int96, Int96Type,
+ BoolType, ByteArray, ByteArrayType, DataType, DoubleType,
FixedLenByteArray,
+ FixedLenByteArrayType, FloatType, Int32Type, Int64Type, Int96,
Int96Type,
};
use crate::errors::Result;
use crate::file::metadata::{PageIndexPolicy, ParquetMetaData,
ParquetStatisticsPolicy};
use crate::file::properties::{EnabledStatistics, WriterProperties,
WriterVersion};
- use crate::file::writer::SerializedFileWriter;
+ use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
use crate::schema::parser::parse_message_type;
use crate::schema::types::{Type, TypePtr};
use crate::util::test_common::rand_gen::RandGen;
@@ -3686,6 +3686,77 @@ pub(crate) mod tests {
}
}
+ // test that we can handle the UNKNOWN logical type annotation on any
physical type
+ #[test]
+ fn test_unknown_logical_type() {
+ let message_type = "message uk {
+ OPTIONAL INT32 uki32 (UNKNOWN);
+ OPTIONAL INT64 uki64 (UNKNOWN);
+ OPTIONAL INT96 uki96 (UNKNOWN);
+ OPTIONAL BOOLEAN ukbool (UNKNOWN);
+ OPTIONAL FLOAT ukfloat (UNKNOWN);
+ OPTIONAL DOUBLE ukdbl (UNKNOWN);
+ OPTIONAL BYTE_ARRAY ukbytes (UNKNOWN);
+ OPTIONAL FIXED_LEN_BYTE_ARRAY(10) ukflba (UNKNOWN);
+ }";
+
+ let schema = Arc::new(parse_message_type(message_type).unwrap());
+ let file = tempfile::tempfile().unwrap();
+
+ let mut writer =
+ SerializedFileWriter::new(file.try_clone().unwrap(), schema,
Default::default())
+ .unwrap();
+
+ let mut row_group_writer = writer.next_row_group().unwrap();
+
+ fn write_nulls<T: DataType>(row_group_writer: &mut
SerializedRowGroupWriter<'_, File>) {
+ let mut column_writer =
row_group_writer.next_column().unwrap().unwrap();
+ // write out a bunch of nulls
+ column_writer
+ .typed::<T>()
+ .write_batch(&[], Some(&[0, 0, 0, 0]), None)
+ .unwrap();
+ column_writer.close().unwrap();
+ }
+
+ // INT32
+ write_nulls::<Int32Type>(&mut row_group_writer);
+
+ // INT64
+ write_nulls::<Int64Type>(&mut row_group_writer);
+
+ // INT96
+ write_nulls::<Int96Type>(&mut row_group_writer);
+
+ // BOOLEAN
+ write_nulls::<BoolType>(&mut row_group_writer);
+
+ // FLOAT
+ write_nulls::<FloatType>(&mut row_group_writer);
+
+ // DOUBLE
+ write_nulls::<DoubleType>(&mut row_group_writer);
+
+ // BYTE_ARRAY
+ write_nulls::<ByteArrayType>(&mut row_group_writer);
+
+ // FIXED_LEN_BYTE_ARRAY
+ write_nulls::<FixedLenByteArrayType>(&mut row_group_writer);
+
+ row_group_writer.close().unwrap();
+
+ writer.close().unwrap();
+
+ let mut reader = ParquetRecordBatchReader::try_new(file, 4).unwrap();
+ let batch = reader.next().unwrap().unwrap();
+
+ for col in batch.columns() {
+ assert_eq!(col.len(), 4);
+ assert_eq!(col.logical_null_count(), 4);
+ assert_eq!(*col.data_type(), ArrowDataType::Null);
+ }
+ }
+
#[test]
fn test_nested_nullability() {
let message_type = "message nested {
diff --git a/parquet/src/arrow/schema/primitive.rs
b/parquet/src/arrow/schema/primitive.rs
index 8959081bcb..b440753cc8 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -115,17 +115,22 @@ fn from_parquet(parquet_type: &Type) -> Result<DataType> {
scale,
precision,
..
- } => match physical_type {
- PhysicalType::BOOLEAN => Ok(DataType::Boolean),
- PhysicalType::INT32 => from_int32(basic_info, *scale, *precision),
- PhysicalType::INT64 => from_int64(basic_info, *scale, *precision),
- PhysicalType::INT96 =>
Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)),
- PhysicalType::FLOAT => Ok(DataType::Float32),
- PhysicalType::DOUBLE => Ok(DataType::Float64),
- PhysicalType::BYTE_ARRAY => from_byte_array(basic_info,
*precision, *scale),
- PhysicalType::FIXED_LEN_BYTE_ARRAY => {
- from_fixed_len_byte_array(basic_info, *scale, *precision,
*type_length)
- }
+ } => match basic_info.logical_type_ref() {
+ // Any physical type can have the UNKNOWN logical type annotation.
Check for that first.
+ //
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null
+ Some(&LogicalType::Unknown) => Ok(DataType::Null),
+ _ => match physical_type {
+ PhysicalType::BOOLEAN => Ok(DataType::Boolean),
+ PhysicalType::INT32 => from_int32(basic_info, *scale,
*precision),
+ PhysicalType::INT64 => from_int64(basic_info, *scale,
*precision),
+ PhysicalType::INT96 =>
Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)),
+ PhysicalType::FLOAT => Ok(DataType::Float32),
+ PhysicalType::DOUBLE => Ok(DataType::Float64),
+ PhysicalType::BYTE_ARRAY => from_byte_array(basic_info,
*precision, *scale),
+ PhysicalType::FIXED_LEN_BYTE_ARRAY => {
+ from_fixed_len_byte_array(basic_info, *scale, *precision,
*type_length)
+ }
+ },
},
Type::GroupType { .. } => unreachable!(),
}
@@ -194,8 +199,6 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
unit
)),
},
- //
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null
- (Some(LogicalType::Unknown), _) => Ok(DataType::Null),
(None, ConvertedType::UINT_8) => Ok(DataType::UInt8),
(None, ConvertedType::UINT_16) => Ok(DataType::UInt16),
(None, ConvertedType::UINT_32) => Ok(DataType::UInt32),
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 2925557e7b..2c63da74df 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -398,7 +398,7 @@ impl<'a> PrimitiveTypeBuilder<'a> {
(LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
if *bit_width == 64 => {}
// Null type
- (LogicalType::Unknown, PhysicalType::INT32) => {}
+ (LogicalType::Unknown, _) => {}
(LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
(LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
(LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}