This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1ba5d486a4 [Variant] Preserve `UUID` extension type metadata for 
Parquet writer (#10015)
1ba5d486a4 is described below

commit 1ba5d486a45b234d5bdc5e91850c241d5a7bccec
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Thu Jun 11 17:17:42 2026 -0400

    [Variant] Preserve `UUID` extension type metadata for Parquet writer 
(#10015)
    
    # Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    -->
    
    - Closes ##8420.
    
    # Rationale for this change
    
    Shredding into `FixedSizeBinary(16)` means we're shredding into `UUID`
    Parquet logical type. `shred_variant` currently doesn't preserve
    extension type metadata for the typed value field.
    
    UUID is the only valid `Variant` shredding type that requires an arrow
    extension type.
    https://github.com/apache/parquet-format/blob/master/VariantShredding.md
    
    Earlier in
    [#](https://github.com/apache/arrow-rs/issues/8665#issuecomment-3423325006)
    @scovich mentioned:
    
    > Yeah, as long as `shred_variant` only takes a `DataType` instead of a
    `Field`, we are forced to assume 16-byte fixed binary is UUID. If it
    accepted a `Field`, we should additionally require the UUID extension
    type. Otherwise, we potentially run into problems because Decimal128 can
    _also_ use 16-byte fixed binary!
    
    This is an argument proposing to use `Field` instead of `DataType` for
    `as_type` parameter in `shred_variant`. This should not be an issue
    because arrow has a `Decimal128Type` to represent `Decimal128` logical
    Parquet type. This way there's no ambiguity in using
    `FixedSizeBinary(16)` arrow type to represent `UUID`. Switching
    `as_type` to `Field` is unnecessary.
    
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    # What changes are included in this PR?
    
    - `VariantArray::from_parts/ShreddedVariantFieldArray::from_parts` now
    add `UUID` extension type metadata to the typed_value `Field` if
    `DataType` is `FixedSizeBinary(16)`
    - Uncommented `UUID` extension part metadata validation in a unit test.
    
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
    
    # Are these changes tested?
    
    - Yes, unit test.
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    
    If this PR claims a performance improvement, please include evidence
    such as benchmark results.
    -->
    
    # Are there any user-facing changes?
    
    - Shredded `UUID` typed value fields now preserve `UUID` extension type
    metadata.
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    -->
    
    ---------
    
    Co-authored-by: Ryan Johnson <[email protected]>
---
 parquet-variant-compute/src/shred_variant.rs | 42 +++++++++-----
 parquet-variant-compute/src/variant_array.rs | 84 +++++++++++++++++++++++++---
 2 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/parquet-variant-compute/src/shred_variant.rs 
b/parquet-variant-compute/src/shred_variant.rs
index 440f4b7165..8f3963cce0 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -1189,19 +1189,9 @@ mod tests {
 
         let variant_array = shred_variant(&input, 
&DataType::FixedSizeBinary(16)).unwrap();
 
-        // // inspect the typed_value Field and make sure it contains the 
canonical Uuid extension type
-        // let typed_value_field = variant_array
-        //     .inner()
-        //     .fields()
-        //     .into_iter()
-        //     .find(|f| f.name() == "typed_value")
-        //     .unwrap();
-
-        // assert!(
-        //     typed_value_field
-        //         .try_extension_type::<extension::Uuid>()
-        //         .is_ok()
-        // );
+        let typed_value_field = 
variant_array.inner().field_by_name("typed_value").unwrap();
+
+        
assert!(typed_value_field.has_valid_extension_type::<arrow_schema::extension::Uuid>());
 
         // probe the downcasted typed_value array to make sure uuids are 
shredded correctly
         let uuids = variant_array
@@ -1227,6 +1217,32 @@ mod tests {
         assert_eq!(got_uuid_2, mock_uuid_2.as_bytes());
     }
 
+    #[test]
+    fn test_uuid_nested_shredding() {
+        let mock_uuid = Uuid::new_v4();
+        let input = build_variant_array(vec![VariantRow::Object(vec![(
+            "id",
+            VariantValue::from(mock_uuid),
+        )])]);
+        let target = ShreddedSchemaBuilder::default()
+            .with_path("id", DataType::FixedSizeBinary(16))
+            .unwrap()
+            .build();
+
+        let result = shred_variant(&input, &target).unwrap();
+
+        let typed_value = result.typed_value_field().unwrap();
+        let typed_struct = 
typed_value.as_any().downcast_ref::<StructArray>().unwrap();
+        let id =
+            
ShreddedVariantFieldArray::try_new(typed_struct.column_by_name("id").unwrap()).unwrap();
+
+        // The extension type lives on the field, not the array, so assert it 
on the inner struct.
+        let leaf = id.inner().field_by_name("typed_value").unwrap();
+
+        assert_eq!(leaf.data_type(), &DataType::FixedSizeBinary(16));
+        
assert!(leaf.has_valid_extension_type::<arrow_schema::extension::Uuid>());
+    }
+
     #[test]
     fn test_primitive_shredding_comprehensive() {
         // Test mixed scenarios in a single array
diff --git a/parquet-variant-compute/src/variant_array.rs 
b/parquet-variant-compute/src/variant_array.rs
index a6ee281002..d773fe475d 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -31,7 +31,7 @@ use arrow::datatypes::{
     TimestampMicrosecondType, TimestampNanosecondType,
 };
 use arrow::error::Result;
-use arrow_schema::extension::ExtensionType;
+use arrow_schema::extension::{ExtensionType, Uuid as UuidExtension};
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
 use chrono::{DateTime, NaiveTime};
 use parquet_variant::{
@@ -327,7 +327,7 @@ impl VariantArray {
             builder = builder.with_field("value", value, true);
         }
         if let Some(typed_value) = typed_value.clone() {
-            builder = builder.with_field("typed_value", typed_value, true);
+            builder = builder.with_field_ref(typed_value_field(&typed_value), 
typed_value);
         }
         if let Some(nulls) = nulls {
             builder = builder.with_nulls(nulls);
@@ -713,7 +713,7 @@ impl ShreddedVariantFieldArray {
             builder = builder.with_field("value", value, true);
         }
         if let Some(typed_value) = typed_value.clone() {
-            builder = builder.with_field("typed_value", typed_value, true);
+            builder = builder.with_field_ref(typed_value_field(&typed_value), 
typed_value);
         }
         if let Some(nulls) = nulls {
             builder = builder.with_nulls(nulls);
@@ -868,6 +868,19 @@ impl TryFrom<&StructArray> for ShreddingState {
     }
 }
 
+/// Build the `typed_value` [`FieldRef`] for a shredded column.
+///
+/// The Variant spec maps `FixedSizeBinary(16)` exclusively to UUID, so any
+/// shredded column of that type must carry the canonical [`UuidExtension`]
+/// extension metadata on its field.
+fn typed_value_field(array: &ArrayRef) -> FieldRef {
+    let mut field = Field::new("typed_value", array.data_type().clone(), true);
+    if matches!(array.data_type(), DataType::FixedSizeBinary(16)) {
+        field = field.with_extension_type(UuidExtension);
+    }
+    Arc::new(field)
+}
+
 /// Builds struct arrays from component fields
 ///
 /// TODO: move to arrow crate
@@ -891,6 +904,16 @@ impl StructArrayBuilder {
         self
     }
 
+    /// Add an array to this struct array using a caller-supplied [`FieldRef`].
+    ///
+    /// Use this when the field carries metadata (e.g. an extension type) that
+    /// would be lost if the field were synthesized from the array's data type 
alone.
+    pub fn with_field_ref(mut self, field: FieldRef, array: ArrayRef) -> Self {
+        self.fields.push(field);
+        self.arrays.push(array);
+        self
+    }
+
     /// Set the null buffer for this struct array.
     pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
         self.nulls = Some(nulls);
@@ -1202,7 +1225,19 @@ fn canonicalize_and_verify_data_type(data_type: 
&DataType) -> Result<Cow<'_, Dat
 }
 
 fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, 
Arc<Field>>> {
-    let Cow::Owned(new_data_type) = 
canonicalize_and_verify_data_type(field.data_type())? else {
+    let new_data_type = canonicalize_and_verify_data_type(field.data_type())?;
+
+    // A shredded FixedSizeBinary(16) column is always a UUID. Tag it with the 
UUID extension type
+    // on read, as a safety net against writers that emit the column without 
the extension metadata.
+    // Canonicalization never rewrites FixedSizeBinary(16), so the type is 
already correct here.
+    if matches!(new_data_type.as_ref(), DataType::FixedSizeBinary(16))
+        && !field.has_valid_extension_type::<UuidExtension>()
+    {
+        let new_field = 
field.as_ref().clone().with_extension_type(UuidExtension);
+        return Ok(Cow::Owned(Arc::new(new_field)));
+    }
+
+    let Cow::Owned(new_data_type) = new_data_type else {
         return Ok(Cow::Borrowed(field));
     };
     let new_field = field.as_ref().clone().with_data_type(new_data_type);
@@ -1216,9 +1251,9 @@ mod test {
 
     use super::*;
     use arrow::array::{
-        BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, 
Decimal128Array, Int32Array,
-        Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, 
ListArray, ListViewArray,
-        Time64MicrosecondArray,
+        BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, 
Decimal128Array,
+        FixedSizeBinaryArray, Int32Array, Int64Array, LargeBinaryArray, 
LargeListArray,
+        LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
     };
     use arrow::buffer::{OffsetBuffer, ScalarBuffer};
     use arrow_schema::{Field, Fields};
@@ -1329,6 +1364,41 @@ mod test {
             .build()
     }
 
+    #[test]
+    fn try_new_tags_untagged_uuid_on_read() {
+        // Simulate a foreign writer that shredded a UUID column as bare 
FixedSizeBinary(16),
+        // omitting the UUID extension type.
+        let typed_value = 
FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 2));
+        let input = 
make_variant_struct_with_typed_value(Arc::new(typed_value.unwrap()));
+
+        // try_new canonicalizes on the read path and attaches the extension.
+        let variant_array = VariantArray::try_new(&input).unwrap();
+        let typed_value = 
variant_array.inner().field_by_name("typed_value").unwrap();
+        assert_eq!(typed_value.data_type(), &DataType::FixedSizeBinary(16));
+        assert!(typed_value.has_valid_extension_type::<UuidExtension>());
+    }
+
+    #[test]
+    fn try_new_tags_untagged_nested_uuid_on_read() {
+        // A shredded object { id: { typed_value: FixedSizeBinary(16) } } 
whose inner UUID leaf
+        // carries no extension type; canonicalization must reach it 
recursively.
+        let leaf = 
FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 1)).unwrap();
+        let inner = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(leaf), true)
+            .build();
+        let object = StructArrayBuilder::new()
+            .with_field("id", Arc::new(inner), false)
+            .build();
+        let input = make_variant_struct_with_typed_value(Arc::new(object));
+
+        // typed_value (struct) -> id (struct) -> typed_value (the 
FixedSizeBinary(16) UUID leaf).
+        let variant_array = VariantArray::try_new(&input).unwrap();
+        let object = variant_array.typed_value_field().unwrap().as_struct();
+        let id = object.column_by_name("id").unwrap().as_struct();
+        let uuid_leaf = id.field_by_name("typed_value").unwrap();
+        assert!(uuid_leaf.has_valid_extension_type::<UuidExtension>());
+    }
+
     #[test]
     fn all_null_shredding_state() {
         // Verify the shredding state is AllNull

Reply via email to