This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1ddb4d9910 [PARQUET] Allow `UNKNOWN` logical type annotation on any physical type (#9855)
1ddb4d9910 is described below

commit 1ddb4d99108590516cdaaaf37074cc8c51264320
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 1 12:11:51 2026 -0700

    [PARQUET] Allow `UNKNOWN` logical type annotation on any physical type (#9855)
    
    # Which issue does this PR close?
    
    - Closes #9844.
    
    # Rationale for this change
    
    For the most part, Parquet writers annotate only `INT32` columns with
    the `UNKNOWN` logical type. This annotation denotes a column that
    contains only null values and whose actual physical type cannot be
    deduced. This crate assumes that _only_ INT32 columns can be so
    annotated, but there is no such requirement in the Parquet
    [specification](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null).
    
    # What changes are included in this PR?
    
    Modifies Parquet schema parsing to allow `UNKNOWN` on any physical
    type, and ensures that when it is encountered, an Arrow array reader
    for the `Null` datatype is used.
    
    # Are these changes tested?
    
    Yes, a unit test is added.
    
    # Are there any user-facing changes?
    
    No API changes, just a behavior change.
---
 parquet/src/arrow/array_reader/builder.rs | 36 ++++++++-------
 parquet/src/arrow/arrow_reader/mod.rs     | 77 +++++++++++++++++++++++++++++--
 parquet/src/arrow/schema/primitive.rs     | 29 ++++++------
 parquet/src/schema/types.rs               |  2 +-
 4 files changed, 111 insertions(+), 33 deletions(-)

diff --git a/parquet/src/arrow/array_reader/builder.rs 
b/parquet/src/arrow/array_reader/builder.rs
index d806b2147a..0c575bf2e6 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -422,6 +422,20 @@ impl<'a> ArrayReaderBuilder<'a> {
         let page_iterator = self.row_groups.column_chunks(col_idx)?;
         let arrow_type = Some(field.arrow_type.clone());
 
+        // LogicalType::Unknown maps to DataType::Null. In the past it has been assumed
+        // that only INT32 can have this annotation, but this is not required by the Parquet
+        // specification. Since this can only annotate an entirely null column, the data type
+        // used for the NullArrayReader should be irrelevant. It's just needed to read the
+        // repetition and definition level data.
+        if matches!(arrow_type, Some(DataType::Null)) {
+            let reader = Box::new(NullArrayReader::<Int32Type>::new(
+                page_iterator,
+                column_desc,
+                self.batch_size,
+            )?) as _;
+            return Ok(Some(reader));
+        }
+
         let reader = match physical_type {
             PhysicalType::BOOLEAN => 
Box::new(PrimitiveArrayReader::<BoolType>::new(
                 page_iterator,
@@ -429,22 +443,12 @@ impl<'a> ArrayReaderBuilder<'a> {
                 arrow_type,
                 self.batch_size,
             )?) as _,
-            PhysicalType::INT32 => {
-                if let Some(DataType::Null) = arrow_type {
-                    Box::new(NullArrayReader::<Int32Type>::new(
-                        page_iterator,
-                        column_desc,
-                        self.batch_size,
-                    )?) as _
-                } else {
-                    Box::new(PrimitiveArrayReader::<Int32Type>::new(
-                        page_iterator,
-                        column_desc,
-                        arrow_type,
-                        self.batch_size,
-                    )?) as _
-                }
-            }
+            PhysicalType::INT32 => 
Box::new(PrimitiveArrayReader::<Int32Type>::new(
+                page_iterator,
+                column_desc,
+                arrow_type,
+                self.batch_size,
+            )?) as _,
             PhysicalType::INT64 => 
Box::new(PrimitiveArrayReader::<Int64Type>::new(
                 page_iterator,
                 column_desc,
diff --git a/parquet/src/arrow/arrow_reader/mod.rs 
b/parquet/src/arrow/arrow_reader/mod.rs
index f9ef4eac65..70d3ce7cf9 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1600,13 +1600,13 @@ pub(crate) mod tests {
     use crate::basic::{ConvertedType, Encoding, LogicalType, Repetition, Type 
as PhysicalType};
     use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
     use crate::data_type::{
-        BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, 
FixedLenByteArrayType,
-        FloatType, Int32Type, Int64Type, Int96, Int96Type,
+        BoolType, ByteArray, ByteArrayType, DataType, DoubleType, 
FixedLenByteArray,
+        FixedLenByteArrayType, FloatType, Int32Type, Int64Type, Int96, 
Int96Type,
     };
     use crate::errors::Result;
     use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, 
ParquetStatisticsPolicy};
     use crate::file::properties::{EnabledStatistics, WriterProperties, 
WriterVersion};
-    use crate::file::writer::SerializedFileWriter;
+    use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
     use crate::schema::parser::parse_message_type;
     use crate::schema::types::{Type, TypePtr};
     use crate::util::test_common::rand_gen::RandGen;
@@ -3686,6 +3686,77 @@ pub(crate) mod tests {
         }
     }
 
+    // test that we can handle the UNKNOWN logical type annotation on any physical type
+    #[test]
+    fn test_unknown_logical_type() {
+        let message_type = "message uk {
+            OPTIONAL INT32 uki32 (UNKNOWN);
+            OPTIONAL INT64 uki64 (UNKNOWN);
+            OPTIONAL INT96 uki96 (UNKNOWN);
+            OPTIONAL BOOLEAN ukbool (UNKNOWN);
+            OPTIONAL FLOAT ukfloat (UNKNOWN);
+            OPTIONAL DOUBLE ukdbl (UNKNOWN);
+            OPTIONAL BYTE_ARRAY ukbytes (UNKNOWN);
+            OPTIONAL FIXED_LEN_BYTE_ARRAY(10) ukflba (UNKNOWN);
+        }";
+
+        let schema = Arc::new(parse_message_type(message_type).unwrap());
+        let file = tempfile::tempfile().unwrap();
+
+        let mut writer =
+            SerializedFileWriter::new(file.try_clone().unwrap(), schema, 
Default::default())
+                .unwrap();
+
+        let mut row_group_writer = writer.next_row_group().unwrap();
+
+        fn write_nulls<T: DataType>(row_group_writer: &mut 
SerializedRowGroupWriter<'_, File>) {
+            let mut column_writer = 
row_group_writer.next_column().unwrap().unwrap();
+            // write out a bunch of nulls
+            column_writer
+                .typed::<T>()
+                .write_batch(&[], Some(&[0, 0, 0, 0]), None)
+                .unwrap();
+            column_writer.close().unwrap();
+        }
+
+        // INT32
+        write_nulls::<Int32Type>(&mut row_group_writer);
+
+        // INT64
+        write_nulls::<Int64Type>(&mut row_group_writer);
+
+        // INT96
+        write_nulls::<Int96Type>(&mut row_group_writer);
+
+        // BOOLEAN
+        write_nulls::<BoolType>(&mut row_group_writer);
+
+        // FLOAT
+        write_nulls::<FloatType>(&mut row_group_writer);
+
+        // DOUBLE
+        write_nulls::<DoubleType>(&mut row_group_writer);
+
+        // BYTE_ARRAY
+        write_nulls::<ByteArrayType>(&mut row_group_writer);
+
+        // FIXED_LEN_BYTE_ARRAY
+        write_nulls::<FixedLenByteArrayType>(&mut row_group_writer);
+
+        row_group_writer.close().unwrap();
+
+        writer.close().unwrap();
+
+        let mut reader = ParquetRecordBatchReader::try_new(file, 4).unwrap();
+        let batch = reader.next().unwrap().unwrap();
+
+        for col in batch.columns() {
+            assert_eq!(col.len(), 4);
+            assert_eq!(col.logical_null_count(), 4);
+            assert_eq!(*col.data_type(), ArrowDataType::Null);
+        }
+    }
+
     #[test]
     fn test_nested_nullability() {
         let message_type = "message nested {
diff --git a/parquet/src/arrow/schema/primitive.rs 
b/parquet/src/arrow/schema/primitive.rs
index 8959081bcb..b440753cc8 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -115,17 +115,22 @@ fn from_parquet(parquet_type: &Type) -> Result<DataType> {
             scale,
             precision,
             ..
-        } => match physical_type {
-            PhysicalType::BOOLEAN => Ok(DataType::Boolean),
-            PhysicalType::INT32 => from_int32(basic_info, *scale, *precision),
-            PhysicalType::INT64 => from_int64(basic_info, *scale, *precision),
-            PhysicalType::INT96 => 
Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)),
-            PhysicalType::FLOAT => Ok(DataType::Float32),
-            PhysicalType::DOUBLE => Ok(DataType::Float64),
-            PhysicalType::BYTE_ARRAY => from_byte_array(basic_info, 
*precision, *scale),
-            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
-                from_fixed_len_byte_array(basic_info, *scale, *precision, 
*type_length)
-            }
+        } => match basic_info.logical_type_ref() {
+            // Any physical type can have the UNKNOWN logical type annotation. Check for that first.
+            // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null
+            Some(&LogicalType::Unknown) => Ok(DataType::Null),
+            _ => match physical_type {
+                PhysicalType::BOOLEAN => Ok(DataType::Boolean),
+                PhysicalType::INT32 => from_int32(basic_info, *scale, 
*precision),
+                PhysicalType::INT64 => from_int64(basic_info, *scale, 
*precision),
+                PhysicalType::INT96 => 
Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)),
+                PhysicalType::FLOAT => Ok(DataType::Float32),
+                PhysicalType::DOUBLE => Ok(DataType::Float64),
+                PhysicalType::BYTE_ARRAY => from_byte_array(basic_info, 
*precision, *scale),
+                PhysicalType::FIXED_LEN_BYTE_ARRAY => {
+                    from_fixed_len_byte_array(basic_info, *scale, *precision, 
*type_length)
+                }
+            },
         },
         Type::GroupType { .. } => unreachable!(),
     }
@@ -194,8 +199,6 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision: 
i32) -> Result<DataTy
                 unit
             )),
         },
-        // 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#unknown-always-null
-        (Some(LogicalType::Unknown), _) => Ok(DataType::Null),
         (None, ConvertedType::UINT_8) => Ok(DataType::UInt8),
         (None, ConvertedType::UINT_16) => Ok(DataType::UInt16),
         (None, ConvertedType::UINT_32) => Ok(DataType::UInt32),
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 2925557e7b..2c63da74df 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -398,7 +398,7 @@ impl<'a> PrimitiveTypeBuilder<'a> {
                 (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
                     if *bit_width == 64 => {}
                 // Null type
-                (LogicalType::Unknown, PhysicalType::INT32) => {}
+                (LogicalType::Unknown, _) => {}
                 (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
                 (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
                 (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}

Reply via email to