This is an automated email from the ASF dual-hosted git repository.
scovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new edd2c8eef5 support large string for unshred variant (#9515)
edd2c8eef5 is described below
commit edd2c8eef5a7b702947a25e3223539e3723d5aac
Author: Matthew Kim <[email protected]>
AuthorDate: Mon Mar 9 12:57:17 2026 -0400
support large string for unshred variant (#9515)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/9513
# Rationale for this change
`VariantArray::try_new` and `canonicalize_and_verify_data_type` both
accept `LargeUtf8` as a valid shredded variant type. However,
`unshred_variant` currently only handles `Utf8` for string `typed_value`
columns.
This means a `VariantArray` with a `LargeUtf8` `typed_value` column can be
constructed successfully, but calling `unshred_variant` on it fails.
---
parquet-variant-compute/src/unshred_variant.rs | 44 ++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/parquet-variant-compute/src/unshred_variant.rs
b/parquet-variant-compute/src/unshred_variant.rs
index 3600662915..0fba53b315 100644
--- a/parquet-variant-compute/src/unshred_variant.rs
+++ b/parquet-variant-compute/src/unshred_variant.rs
@@ -20,8 +20,8 @@
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
use arrow::array::{
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray,
FixedSizeListArray,
- GenericListArray, GenericListViewArray, ListLikeArray, PrimitiveArray,
StringArray,
- StructArray,
+ GenericListArray, GenericListViewArray, LargeStringArray, ListLikeArray,
PrimitiveArray,
+ StringArray, StructArray,
};
use arrow::buffer::NullBuffer;
use arrow::datatypes::{
@@ -105,6 +105,7 @@ enum UnshredVariantRowBuilder<'a> {
TimestampNanosecond(TimestampUnshredRowBuilder<'a,
TimestampNanosecondType>),
PrimitiveBoolean(UnshredPrimitiveRowBuilder<'a, BooleanArray>),
PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
+ PrimitiveLargeString(UnshredPrimitiveRowBuilder<'a, LargeStringArray>),
PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
List(ListUnshredVariantBuilder<'a, GenericListArray<i32>>),
@@ -146,6 +147,7 @@ impl<'a> UnshredVariantRowBuilder<'a> {
Self::TimestampNanosecond(b) => b.append_row(builder, metadata,
index),
Self::PrimitiveBoolean(b) => b.append_row(builder, metadata,
index),
Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
+ Self::PrimitiveLargeString(b) => b.append_row(builder, metadata,
index),
Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata,
index),
Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
Self::List(b) => b.append_row(builder, metadata, index),
@@ -226,6 +228,7 @@ impl<'a> UnshredVariantRowBuilder<'a> {
}
DataType::Boolean => primitive_builder!(PrimitiveBoolean,
as_boolean),
DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
+ DataType::LargeUtf8 => primitive_builder!(PrimitiveLargeString,
as_string),
DataType::BinaryView => primitive_builder!(PrimitiveBinaryView,
as_binary_view),
DataType::FixedSizeBinary(16) => {
primitive_builder!(PrimitiveUuid, as_fixed_size_binary)
@@ -405,6 +408,7 @@ macro_rules! impl_append_to_variant_builder {
impl_append_to_variant_builder!(BooleanArray);
impl_append_to_variant_builder!(StringArray);
+impl_append_to_variant_builder!(LargeStringArray);
impl_append_to_variant_builder!(BinaryViewArray);
impl_append_to_variant_builder!(PrimitiveArray<Int8Type>);
impl_append_to_variant_builder!(PrimitiveArray<Int16Type>);
@@ -666,3 +670,39 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a,
L> {
// TODO: This code is covered by tests in
`parquet/tests/variant_integration.rs`. Does that suffice?
// Or do we also need targeted stand-alone unit tests for full coverage?
+
+#[cfg(test)]
+mod tests {
+ use crate::VariantArray;
+ use arrow::array::{BinaryViewArray, LargeStringArray};
+ use parquet_variant::Variant;
+
+ #[test]
+ fn test_unshred_largeutf8_typed_value() {
+ let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
+ let metadata =
+ BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
+
+ let typed_value: arrow::array::ArrayRef = std::sync::Arc::new(
+ LargeStringArray::from(vec![
+ Some("hello"),
+ Some("middle"),
+ Some("world"),
+ ]),
+ );
+
+ let variant_array = VariantArray::from_parts(
+ metadata,
+ None,
+ Some(typed_value),
+ None,
+ );
+
+ let result = crate::unshred_variant(&variant_array).unwrap();
+
+ assert_eq!(result.len(), 3);
+ assert_eq!(result.value(0), Variant::from("hello"));
+ assert_eq!(result.value(1), Variant::from("middle"));
+ assert_eq!(result.value(2), Variant::from("world"));
+ }
+}