This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 8bed541f31 feat: Support round trip reading/writing Arrow type 
`Dictionary(_, FixedSizeBinary(_))` to Parquet (#7446)
8bed541f31 is described below

commit 8bed541f314dfc6adfa1f7b46d4a43fd4118e7c7
Author: albertlockett <[email protected]>
AuthorDate: Fri May 9 05:55:50 2025 -0400

    feat: Support round trip reading/writing Arrow type `Dictionary(_, 
FixedSizeBinary(_))` to Parquet (#7446)
    
    * support FixedSizeBinary in dict encoding
    
    * roundtrip works
    
    * cleanup
    
    * clippy and linter
    
    * support all types of keys in byte_array_dictionary
    
    * back out change included by mistake
    
    * linter
    
    * PR feedback before cleanup
    
    * PR feedback from Weston
    
    ---------
    
    Co-authored-by: albertlockett <[email protected]>
---
 parquet/src/arrow/array_reader/builder.rs          |  9 ++--
 .../arrow/array_reader/byte_array_dictionary.rs    | 16 ++++----
 parquet/src/arrow/arrow_writer/byte_array.rs       |  7 +++-
 parquet/src/arrow/arrow_writer/mod.rs              | 48 ++++++++++++++++++++++
 parquet/src/arrow/buffer/dictionary_buffer.rs      |  9 ++++
 5 files changed, 76 insertions(+), 13 deletions(-)

diff --git a/parquet/src/arrow/array_reader/builder.rs 
b/parquet/src/arrow/array_reader/builder.rs
index 945f62526a..5ada61e93d 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -289,9 +289,12 @@ fn build_primitive_reader(
             }
             _ => make_byte_array_reader(page_iterator, column_desc, 
arrow_type)?,
         },
-        PhysicalType::FIXED_LEN_BYTE_ARRAY => {
-            make_fixed_len_byte_array_reader(page_iterator, column_desc, 
arrow_type)?
-        }
+        PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type {
+            Some(DataType::Dictionary(_, _)) => {
+                make_byte_array_dictionary_reader(page_iterator, column_desc, 
arrow_type)?
+            }
+            _ => make_fixed_len_byte_array_reader(page_iterator, column_desc, 
arrow_type)?,
+        },
     };
     Ok(Some(reader))
 }
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs 
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 440db641a2..757d3df8a8 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -90,21 +90,21 @@ pub fn make_byte_array_dictionary_reader(
         ArrowType::Dictionary(key_type, value_type) => {
             make_reader! {
                 (pages, column_desc, data_type) => match (key_type.as_ref(), 
value_type.as_ref()) {
-                    (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8) => 
(u8, i32),
+                    (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (u8, i32),
                     (ArrowType::UInt8, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (u8, i64),
-                    (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) => 
(i8, i32),
+                    (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (i8, i32),
                     (ArrowType::Int8, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (i8, i64),
-                    (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8) 
=> (u16, i32),
+                    (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (u16, i32),
                     (ArrowType::UInt16, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (u16, i64),
-                    (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8) => 
(i16, i32),
+                    (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (i16, i32),
                     (ArrowType::Int16, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (i16, i64),
-                    (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8) 
=> (u32, i32),
+                    (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (u32, i32),
                     (ArrowType::UInt32, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (u32, i64),
-                    (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8) => 
(i32, i32),
+                    (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (i32, i32),
                     (ArrowType::Int32, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (i32, i64),
-                    (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8) 
=> (u64, i32),
+                    (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (u64, i32),
                     (ArrowType::UInt64, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (u64, i64),
-                    (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8) => 
(i64, i32),
+                    (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8 | 
ArrowType::FixedSizeBinary(_)) => (i64, i32),
                     (ArrowType::Int64, ArrowType::LargeBinary | 
ArrowType::LargeUtf8) => (i64, i64),
                 }
             }
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs 
b/parquet/src/arrow/arrow_writer/byte_array.rs
index 2d23ad8510..9767ec98e6 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -27,8 +27,8 @@ use crate::schema::types::ColumnDescPtr;
 use crate::util::bit_util::num_required_bits;
 use crate::util::interner::{Interner, Storage};
 use arrow_array::{
-    Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, 
LargeBinaryArray,
-    LargeStringArray, StringArray, StringViewArray,
+    Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, 
FixedSizeBinaryArray,
+    LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
 };
 use arrow_schema::DataType;
 
@@ -85,6 +85,9 @@ macro_rules! downcast_op {
                 DataType::LargeBinary => {
                     downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, 
$arg)*)
                 }
+                DataType::FixedSizeBinary(_) => {
+                    downcast_dict_op!(key, FixedSizeBinaryArray, $array, 
$op$(, $arg)*)
+                }
                 d => unreachable!("cannot downcast {} dictionary value to byte 
array", d),
             },
             d => unreachable!("cannot downcast {} to byte array", d),
diff --git a/parquet/src/arrow/arrow_writer/mod.rs 
b/parquet/src/arrow/arrow_writer/mod.rs
index 1e1054c9a0..66e1b06fa7 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -989,6 +989,9 @@ impl ArrowColumnWriterFactory {
                 ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
                     out.push(bytes(leaves.next().unwrap())?)
                 }
+                ArrowDataType::FixedSizeBinary(_) => {
+                    out.push(bytes(leaves.next().unwrap())?)
+                }
                 _ => {
                     out.push(col(leaves.next().unwrap())?)
                 }
@@ -1333,6 +1336,7 @@ mod tests {
     use arrow_buffer::{i256, IntervalDayTime, IntervalMonthDayNano, 
NullBuffer};
     use arrow_schema::Fields;
     use half::f16;
+    use num::{FromPrimitive, ToPrimitive};
 
     use crate::basic::Encoding;
     use crate::data_type::AsBytes;
@@ -1911,6 +1915,50 @@ mod tests {
         roundtrip(batch, Some(SMALL_SIZE / 2));
     }
 
+    #[test]
+    fn test_fixed_size_binary_in_dict() {
+        fn test_fixed_size_binary_in_dict_inner<K>()
+        where
+            K: ArrowDictionaryKeyType,
+            K::Native: FromPrimitive + ToPrimitive + TryFrom<u8>,
+            <<K as arrow_array::ArrowPrimitiveType>::Native as 
TryFrom<u8>>::Error: std::fmt::Debug,
+        {
+            let field = Field::new(
+                "a",
+                DataType::Dictionary(
+                    Box::new(K::DATA_TYPE),
+                    Box::new(DataType::FixedSizeBinary(4)),
+                ),
+                false,
+            );
+            let schema = Schema::new(vec![field]);
+
+            let keys: Vec<K::Native> = vec![
+                K::Native::try_from(0u8).unwrap(),
+                K::Native::try_from(0u8).unwrap(),
+                K::Native::try_from(1u8).unwrap(),
+            ];
+            let keys = PrimitiveArray::<K>::from_iter_values(keys);
+            let values = FixedSizeBinaryArray::try_from_iter(
+                vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(),
+            )
+            .unwrap();
+
+            let data = DictionaryArray::<K>::new(keys, Arc::new(values));
+            let batch = RecordBatch::try_new(Arc::new(schema), 
vec![Arc::new(data)]).unwrap();
+            roundtrip(batch, None);
+        }
+
+        test_fixed_size_binary_in_dict_inner::<UInt8Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt16Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt32Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt16Type>();
+        test_fixed_size_binary_in_dict_inner::<Int8Type>();
+        test_fixed_size_binary_in_dict_inner::<Int16Type>();
+        test_fixed_size_binary_in_dict_inner::<Int32Type>();
+        test_fixed_size_binary_in_dict_inner::<Int64Type>();
+    }
+
     #[test]
     fn test_empty_dict() {
         let struct_fields = Fields::from(vec![Field::new(
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs 
b/parquet/src/arrow/buffer/dictionary_buffer.rs
index 59f1cfa056..3861776393 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -154,6 +154,15 @@ impl<K: ArrowNativeType + Ord, V: OffsetSizeTrait> 
DictionaryBuffer<K, V> {
                     }
                 }
 
+                let ArrowType::Dictionary(_, value_type) = data_type else {
+                    unreachable!()
+                };
+                let values = if let ArrowType::FixedSizeBinary(size) = 
**value_type {
+                    arrow_cast::cast(&values, 
&ArrowType::FixedSizeBinary(size)).unwrap()
+                } else {
+                    values
+                };
+
                 let builder = ArrayDataBuilder::new(data_type.clone())
                     .len(keys.len())
                     .add_buffer(Buffer::from_vec(keys))

Reply via email to