This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1ba5d486a4 [Variant] Preserve `UUID` extension type metadata for
Parquet writer (#10015)
1ba5d486a4 is described below
commit 1ba5d486a45b234d5bdc5e91850c241d5a7bccec
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Thu Jun 11 17:17:42 2026 -0400
[Variant] Preserve `UUID` extension type metadata for Parquet writer
(#10015)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes ##8420.
# Rationale for this change
Shredding into `FixedSizeBinary(16)` means we're shredding into `UUID`
Parquet logical type. `shred_variant` currently doesn't preserve
extension type metadata for the typed value field.
UUID is the only valid `Variant` shredding type that requires an arrow
extension type.
https://github.com/apache/parquet-format/blob/master/VariantShredding.md
Earlier in
[#](https://github.com/apache/arrow-rs/issues/8665#issuecomment-3423325006)
@scovich mentioned:
> Yeah, as long as `shred_variant` only takes a `DataType` instead of a
`Field`, we are forced to assume 16-byte fixed binary is UUID. If it
accepted a `Field`, we should additionally require the UUID extension
type. Otherwise, we potentially run into problems because Decimal128 can
_also_ use 16-byte fixed binary!
This is an argument proposing to use `Field` instead of `DataType` for
`as_type` parameter in `shred_variant`. This should not be an issue
because arrow has a `Decimal128Type` to represent `Decimal128` logical
Parquet type. This way there's no ambiguity in using
`FixedSizeBinary(16)` arrow type to represent `UUID`. Switching
`as_type` to `Field` is unnecessary.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
- `VariantArray::from_parts/ShreddedVariantFieldArray::from_parts` now
add `UUID` extension type metadata to the typed_value `Field` if
`DataType` is `FixedSizeBinary(16)`
- Uncommented `UUID` extension part metadata validation in a unit test.
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
# Are these changes tested?
- Yes, unit test.
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
If this PR claims a performance improvement, please include evidence
such as benchmark results.
-->
# Are there any user-facing changes?
- Shredded `UUID` typed value fields now preserve `UUID` extension type
metadata.
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---------
Co-authored-by: Ryan Johnson <[email protected]>
---
parquet-variant-compute/src/shred_variant.rs | 42 +++++++++-----
parquet-variant-compute/src/variant_array.rs | 84 +++++++++++++++++++++++++---
2 files changed, 106 insertions(+), 20 deletions(-)
diff --git a/parquet-variant-compute/src/shred_variant.rs
b/parquet-variant-compute/src/shred_variant.rs
index 440f4b7165..8f3963cce0 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -1189,19 +1189,9 @@ mod tests {
let variant_array = shred_variant(&input,
&DataType::FixedSizeBinary(16)).unwrap();
- // // inspect the typed_value Field and make sure it contains the
canonical Uuid extension type
- // let typed_value_field = variant_array
- // .inner()
- // .fields()
- // .into_iter()
- // .find(|f| f.name() == "typed_value")
- // .unwrap();
-
- // assert!(
- // typed_value_field
- // .try_extension_type::<extension::Uuid>()
- // .is_ok()
- // );
+ let typed_value_field =
variant_array.inner().field_by_name("typed_value").unwrap();
+
+
assert!(typed_value_field.has_valid_extension_type::<arrow_schema::extension::Uuid>());
// probe the downcasted typed_value array to make sure uuids are
shredded correctly
let uuids = variant_array
@@ -1227,6 +1217,32 @@ mod tests {
assert_eq!(got_uuid_2, mock_uuid_2.as_bytes());
}
+ #[test]
+ fn test_uuid_nested_shredding() {
+ let mock_uuid = Uuid::new_v4();
+ let input = build_variant_array(vec![VariantRow::Object(vec![(
+ "id",
+ VariantValue::from(mock_uuid),
+ )])]);
+ let target = ShreddedSchemaBuilder::default()
+ .with_path("id", DataType::FixedSizeBinary(16))
+ .unwrap()
+ .build();
+
+ let result = shred_variant(&input, &target).unwrap();
+
+ let typed_value = result.typed_value_field().unwrap();
+ let typed_struct =
typed_value.as_any().downcast_ref::<StructArray>().unwrap();
+ let id =
+
ShreddedVariantFieldArray::try_new(typed_struct.column_by_name("id").unwrap()).unwrap();
+
+ // The extension type lives on the field, not the array, so assert it
on the inner struct.
+ let leaf = id.inner().field_by_name("typed_value").unwrap();
+
+ assert_eq!(leaf.data_type(), &DataType::FixedSizeBinary(16));
+
assert!(leaf.has_valid_extension_type::<arrow_schema::extension::Uuid>());
+ }
+
#[test]
fn test_primitive_shredding_comprehensive() {
// Test mixed scenarios in a single array
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index a6ee281002..d773fe475d 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -31,7 +31,7 @@ use arrow::datatypes::{
TimestampMicrosecondType, TimestampNanosecondType,
};
use arrow::error::Result;
-use arrow_schema::extension::ExtensionType;
+use arrow_schema::extension::{ExtensionType, Uuid as UuidExtension};
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
use chrono::{DateTime, NaiveTime};
use parquet_variant::{
@@ -327,7 +327,7 @@ impl VariantArray {
builder = builder.with_field("value", value, true);
}
if let Some(typed_value) = typed_value.clone() {
- builder = builder.with_field("typed_value", typed_value, true);
+ builder = builder.with_field_ref(typed_value_field(&typed_value),
typed_value);
}
if let Some(nulls) = nulls {
builder = builder.with_nulls(nulls);
@@ -713,7 +713,7 @@ impl ShreddedVariantFieldArray {
builder = builder.with_field("value", value, true);
}
if let Some(typed_value) = typed_value.clone() {
- builder = builder.with_field("typed_value", typed_value, true);
+ builder = builder.with_field_ref(typed_value_field(&typed_value),
typed_value);
}
if let Some(nulls) = nulls {
builder = builder.with_nulls(nulls);
@@ -868,6 +868,19 @@ impl TryFrom<&StructArray> for ShreddingState {
}
}
+/// Build the `typed_value` [`FieldRef`] for a shredded column.
+///
+/// The Variant spec maps `FixedSizeBinary(16)` exclusively to UUID, so any
+/// shredded column of that type must carry the canonical [`UuidExtension`]
+/// extension metadata on its field.
+fn typed_value_field(array: &ArrayRef) -> FieldRef {
+ let mut field = Field::new("typed_value", array.data_type().clone(), true);
+ if matches!(array.data_type(), DataType::FixedSizeBinary(16)) {
+ field = field.with_extension_type(UuidExtension);
+ }
+ Arc::new(field)
+}
+
/// Builds struct arrays from component fields
///
/// TODO: move to arrow crate
@@ -891,6 +904,16 @@ impl StructArrayBuilder {
self
}
+ /// Add an array to this struct array using a caller-supplied [`FieldRef`].
+ ///
+ /// Use this when the field carries metadata (e.g. an extension type) that
+ /// would be lost if the field were synthesized from the array's data type
alone.
+ pub fn with_field_ref(mut self, field: FieldRef, array: ArrayRef) -> Self {
+ self.fields.push(field);
+ self.arrays.push(array);
+ self
+ }
+
/// Set the null buffer for this struct array.
pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
self.nulls = Some(nulls);
@@ -1202,7 +1225,19 @@ fn canonicalize_and_verify_data_type(data_type:
&DataType) -> Result<Cow<'_, Dat
}
fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_,
Arc<Field>>> {
- let Cow::Owned(new_data_type) =
canonicalize_and_verify_data_type(field.data_type())? else {
+ let new_data_type = canonicalize_and_verify_data_type(field.data_type())?;
+
+ // A shredded FixedSizeBinary(16) column is always a UUID. Tag it with the
UUID extension type
+ // on read, as a safety net against writers that emit the column without
the extension metadata.
+ // Canonicalization never rewrites FixedSizeBinary(16), so the type is
already correct here.
+ if matches!(new_data_type.as_ref(), DataType::FixedSizeBinary(16))
+ && !field.has_valid_extension_type::<UuidExtension>()
+ {
+ let new_field =
field.as_ref().clone().with_extension_type(UuidExtension);
+ return Ok(Cow::Owned(Arc::new(new_field)));
+ }
+
+ let Cow::Owned(new_data_type) = new_data_type else {
return Ok(Cow::Borrowed(field));
};
let new_field = field.as_ref().clone().with_data_type(new_data_type);
@@ -1216,9 +1251,9 @@ mod test {
use super::*;
use arrow::array::{
- BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array,
Decimal128Array, Int32Array,
- Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray,
ListArray, ListViewArray,
- Time64MicrosecondArray,
+ BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array,
Decimal128Array,
+ FixedSizeBinaryArray, Int32Array, Int64Array, LargeBinaryArray,
LargeListArray,
+ LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
};
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
use arrow_schema::{Field, Fields};
@@ -1329,6 +1364,41 @@ mod test {
.build()
}
+ #[test]
+ fn try_new_tags_untagged_uuid_on_read() {
+ // Simulate a foreign writer that shredded a UUID column as bare
FixedSizeBinary(16),
+ // omitting the UUID extension type.
+ let typed_value =
FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 2));
+ let input =
make_variant_struct_with_typed_value(Arc::new(typed_value.unwrap()));
+
+ // try_new canonicalizes on the read path and attaches the extension.
+ let variant_array = VariantArray::try_new(&input).unwrap();
+ let typed_value =
variant_array.inner().field_by_name("typed_value").unwrap();
+ assert_eq!(typed_value.data_type(), &DataType::FixedSizeBinary(16));
+ assert!(typed_value.has_valid_extension_type::<UuidExtension>());
+ }
+
+ #[test]
+ fn try_new_tags_untagged_nested_uuid_on_read() {
+ // A shredded object { id: { typed_value: FixedSizeBinary(16) } }
whose inner UUID leaf
+ // carries no extension type; canonicalization must reach it
recursively.
+ let leaf =
FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 1)).unwrap();
+ let inner = StructArrayBuilder::new()
+ .with_field("typed_value", Arc::new(leaf), true)
+ .build();
+ let object = StructArrayBuilder::new()
+ .with_field("id", Arc::new(inner), false)
+ .build();
+ let input = make_variant_struct_with_typed_value(Arc::new(object));
+
+ // typed_value (struct) -> id (struct) -> typed_value (the
FixedSizeBinary(16) UUID leaf).
+ let variant_array = VariantArray::try_new(&input).unwrap();
+ let object = variant_array.typed_value_field().unwrap().as_struct();
+ let id = object.column_by_name("id").unwrap().as_struct();
+ let uuid_leaf = id.field_by_name("typed_value").unwrap();
+ assert!(uuid_leaf.has_valid_extension_type::<UuidExtension>());
+ }
+
#[test]
fn all_null_shredding_state() {
// Verify the shredding state is AllNull