This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 0936b38d3d [Variant] Fix `variant_get` to return `List<T>` instead of
`List<Struct>` (#9631)
0936b38d3d is described below
commit 0936b38d3d3db962af782b368b88ee81fe8e49d5
Author: Liam Bao <[email protected]>
AuthorDate: Mon Apr 6 15:34:39 2026 -0400
[Variant] Fix `variant_get` to return `List<T>` instead of `List<Struct>`
(#9631)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9615.
# Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
`variant_get(..., List<T>)` was returning `List<Struct<value,
typed_value>>` instead of `List<T>`. This happened because
`VariantToListArrowRowBuilder` unconditionally used
`make_variant_to_shredded_variant_arrow_row_builder` for its element
builder, which produces the shredded representation (a struct with
`value` and `typed_value` fields). This is correct for `shred_variant`,
but `variant_get` should produce strongly typed arrays directly.
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
- Introduced a `ListElementBuilder` enum with `Typed` and `Shredded`
variants to abstract over the two element output modes.
- `Typed` wraps `VariantToArrowRowBuilder` and produces the target type
directly (e.g., `Int64Array`).
- `Shredded` wraps `VariantToShreddedVariantRowBuilder` and produces the
`Struct<value, typed_value>` used by shredding.
- Added a `shredded: bool` parameter to
`ArrayVariantToArrowRowBuilder::try_new` and
`VariantToListArrowRowBuilder::try_new` to select the appropriate mode.
# Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
Yes
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
Yes, the `variant_get(..., List<T>)` should have correct behavior now
---
parquet-variant-compute/src/shred_variant.rs | 1 +
parquet-variant-compute/src/variant_get.rs | 137 ++++++++++++------------
parquet-variant-compute/src/variant_to_arrow.rs | 75 ++++++++++---
3 files changed, 130 insertions(+), 83 deletions(-)
diff --git a/parquet-variant-compute/src/shred_variant.rs
b/parquet-variant-compute/src/shred_variant.rs
index d80d2f9863..d2021e12f6 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -303,6 +303,7 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
data_type,
cast_options,
capacity,
+ true,
)?,
nulls: NullBufferBuilder::new(capacity),
null_value,
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 73906f70eb..896900cc5d 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -365,8 +365,8 @@ mod test {
use arrow_schema::{DataType, Field, FieldRef, Fields, IntervalUnit,
TimeUnit};
use chrono::DateTime;
use parquet_variant::{
- EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder,
VariantDecimal4, VariantDecimal8,
- VariantDecimal16, VariantDecimalType, VariantPath,
+ EMPTY_VARIANT_METADATA_BYTES, Variant, VariantDecimal4,
VariantDecimal8, VariantDecimal16,
+ VariantDecimalType, VariantPath,
};
fn single_variant_get_test(input_json: &str, path: VariantPath,
expected_json: &str) {
@@ -4131,69 +4131,45 @@ mod test {
]));
let variant_array =
ArrayRef::from(json_to_variant(&string_array).unwrap());
- let value_array: ArrayRef = {
- let mut builder = VariantBuilder::new();
- builder.append_value("two");
- let (_, value_bytes) = builder.finish();
- Arc::new(BinaryViewArray::from(vec![
- None,
- Some(value_bytes.as_slice()),
- None,
- ]))
- };
- let typed_value_array: ArrayRef =
Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));
- let struct_fields = Fields::from(vec![
- Field::new("value", DataType::BinaryView, true),
- Field::new("typed_value", DataType::Int64, true),
- ]);
- let struct_array: ArrayRef = Arc::new(
- StructArray::try_new(
- struct_fields.clone(),
- vec![value_array.clone(), typed_value_array.clone()],
- None,
- )
- .unwrap(),
- );
-
- let request_field = Arc::new(Field::new("item", DataType::Int64,
true));
- let result_field = Arc::new(Field::new("item",
DataType::Struct(struct_fields), true));
+ let element_array: ArrayRef = Arc::new(Int64Array::from(vec![Some(1),
None, Some(3)]));
+ let field = Arc::new(Field::new("item", Int64, true));
let expectations = vec![
(
- DataType::List(request_field.clone()),
+ DataType::List(field.clone()),
Arc::new(ListArray::new(
- result_field.clone(),
+ field.clone(),
OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 3])),
- struct_array.clone(),
+ element_array.clone(),
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
(
- DataType::LargeList(request_field.clone()),
+ DataType::LargeList(field.clone()),
Arc::new(LargeListArray::new(
- result_field.clone(),
+ field.clone(),
OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 3])),
- struct_array.clone(),
+ element_array.clone(),
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
(
- DataType::ListView(request_field.clone()),
+ DataType::ListView(field.clone()),
Arc::new(ListViewArray::new(
- result_field.clone(),
+ field.clone(),
ScalarBuffer::from(vec![0, 3]),
ScalarBuffer::from(vec![3, 0]),
- struct_array.clone(),
+ element_array.clone(),
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
(
- DataType::LargeListView(request_field),
+ DataType::LargeListView(field.clone()),
Arc::new(LargeListViewArray::new(
- result_field,
+ field,
ScalarBuffer::from(vec![0, 3]),
ScalarBuffer::from(vec![3, 0]),
- struct_array,
+ element_array,
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
@@ -4235,6 +4211,52 @@ mod test {
}
}
+ #[test]
+ fn test_variant_get_nested_list() {
+ use arrow::datatypes::Int64Type;
+
+ let string_array: ArrayRef = Arc::new(StringArray::from(vec![
+ r#"[[1, 2], [3]]"#,
+ r#"[[4], "not a list", [5, 6]]"#,
+ ]));
+ let variant_array =
ArrayRef::from(json_to_variant(&string_array).unwrap());
+
+ let inner_field = Arc::new(Field::new("item", Int64, true));
+ let outer_field = Arc::new(Field::new(
+ "item",
+ DataType::List(inner_field.clone()),
+ true,
+ ));
+ let request_type = DataType::List(outer_field.clone());
+
+ let options =
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+ "result",
+ request_type,
+ true,
+ ))));
+ let result = variant_get(&variant_array, options).unwrap();
+ let outer = result.as_list::<i32>();
+
+ // Row 0: [[1, 2], [3]]
+ let row0 = outer.value(0);
+ let row0 = row0.as_list::<i32>();
+ assert_eq!(row0.len(), 2);
+ let elem0 = row0.value(0);
+ assert_eq!(elem0.as_primitive::<Int64Type>().values(), &[1, 2]);
+ let elem1 = row0.value(1);
+ assert_eq!(elem1.as_primitive::<Int64Type>().values(), &[3]);
+
+ // Row 1: [[4], null, [5, 6]] — "not a list" becomes null inner list
+ let row1 = outer.value(1);
+ let row1 = row1.as_list::<i32>();
+ assert_eq!(row1.len(), 3);
+ let elem0 = row1.value(0);
+ assert_eq!(elem0.as_primitive::<Int64Type>().values(), &[4]);
+ assert!(row1.is_null(1));
+ let elem2 = row1.value(2);
+ assert_eq!(elem2.as_primitive::<Int64Type>().values(), &[5, 6]);
+ }
+
#[test]
fn test_variant_get_list_like_unsafe_cast_errors_on_element_mismatch() {
let string_array: ArrayRef =
@@ -4287,40 +4309,17 @@ mod test {
.with_cast_options(cast_options);
let result = variant_get(&variant_array, options).unwrap();
- let element_struct = result
- .as_any()
- .downcast_ref::<ListArray>()
- .unwrap()
+ let list_array = result.as_any().downcast_ref::<ListArray>().unwrap();
+ let values = list_array
.values()
.as_any()
- .downcast_ref::<StructArray>()
- .unwrap();
-
- let value = element_struct
- .column_by_name("value")
- .unwrap()
- .as_any()
- .downcast_ref::<BinaryViewArray>()
- .unwrap();
- let typed_value = element_struct
- .column_by_name("typed_value")
- .unwrap()
- .as_any()
.downcast_ref::<Int64Array>()
.unwrap();
- assert_eq!(typed_value.len(), 3);
- assert_eq!(typed_value.value(0), 1);
- assert!(typed_value.is_null(1));
- assert_eq!(typed_value.value(2), 3);
-
- assert!(value.is_null(0));
- assert!(value.is_valid(1));
- assert_eq!(
- Variant::new(EMPTY_VARIANT_METADATA_BYTES, value.value(1)),
- Variant::Null
- );
- assert!(value.is_null(2));
+ assert_eq!(values.len(), 3);
+ assert_eq!(values.value(0), 1);
+ assert!(values.is_null(1));
+ assert_eq!(values.value(2), 3);
}
#[test]
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs
b/parquet-variant-compute/src/variant_to_arrow.rs
index dd396117d2..dd054a5f7d 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -107,7 +107,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>(
| DataType::LargeListView(_)
| DataType::FixedSizeList(..)) => {
let builder =
- ArrayVariantToArrowRowBuilder::try_new(data_type,
cast_options, capacity)?;
+ ArrayVariantToArrowRowBuilder::try_new(data_type,
cast_options, capacity, false)?;
Ok(Array(builder))
}
data_type => {
@@ -587,10 +587,17 @@ impl<'a> StructVariantToArrowRowBuilder<'a> {
}
impl<'a> ArrayVariantToArrowRowBuilder<'a> {
+ /// Creates a new list builder for the given data type.
+ ///
+ /// # Arguments
+ /// * `shredded` - If true, element builders produce shredded structs with
`value`/`typed_value`
+ /// fields (for [`crate::shred_variant()`]). If false, element builders
produce strongly typed
+ /// arrays directly (for [`crate::variant_get()`]).
pub(crate) fn try_new(
data_type: &'a DataType,
cast_options: &'a CastOptions,
capacity: usize,
+ shredded: bool,
) -> Result<Self> {
use ArrayVariantToArrowRowBuilder::*;
@@ -602,6 +609,7 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> {
$field.data_type(),
cast_options,
capacity,
+ shredded,
)?)
};
}
@@ -893,13 +901,45 @@ impl<'a> VariantToUuidArrowRowBuilder<'a> {
}
}
+/// Element builder for list variants, supporting both typed (for
[`crate::variant_get()`])
+/// and shredded (for [`crate::shred_variant()`]) output modes.
+enum ListElementBuilder<'a> {
+ /// Produces the target array type directly.
+ Typed(Box<VariantToArrowRowBuilder<'a>>),
+ /// Produces a shredded struct with `value` and `typed_value` fields.
+ Shredded(Box<VariantToShreddedVariantRowBuilder<'a>>),
+}
+
+impl<'a> ListElementBuilder<'a> {
+ fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
+ match self {
+ Self::Typed(b) => b.append_value(value),
+ Self::Shredded(b) => b.append_value(value),
+ }
+ }
+
+ fn finish(self) -> Result<ArrayRef> {
+ match self {
+ Self::Typed(b) => b.finish(),
+ Self::Shredded(b) => {
+ let (value, typed_value, nulls) = b.finish()?;
+ Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts(
+ Some(value),
+ Some(typed_value),
+ nulls,
+ )))
+ }
+ }
+ }
+}
+
pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
where
O: OffsetSizeTrait + ArrowNativeTypeOp,
{
field: FieldRef,
offsets: Vec<O>,
- element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
+ element_builder: ListElementBuilder<'a>,
nulls: NullBufferBuilder,
current_offset: O,
cast_options: &'a CastOptions<'a>,
@@ -914,6 +954,7 @@ where
element_data_type: &'a DataType,
cast_options: &'a CastOptions,
capacity: usize,
+ shredded: bool,
) -> Result<Self> {
if capacity >= isize::MAX as usize {
return Err(ArrowError::ComputeError(
@@ -922,16 +963,24 @@ where
}
let mut offsets = Vec::with_capacity(capacity + 1);
offsets.push(O::ZERO);
- let element_builder =
make_variant_to_shredded_variant_arrow_row_builder(
- element_data_type,
- cast_options,
- capacity,
- NullValue::ArrayElement,
- )?;
+ let element_builder = if shredded {
+ let builder = make_variant_to_shredded_variant_arrow_row_builder(
+ element_data_type,
+ cast_options,
+ capacity,
+ NullValue::ArrayElement,
+ )?;
+ ListElementBuilder::Shredded(Box::new(builder))
+ } else {
+ let builder =
+ make_typed_variant_to_arrow_row_builder(element_data_type,
cast_options, capacity)?;
+ ListElementBuilder::Typed(Box::new(builder))
+ };
+
Ok(Self {
field,
offsets,
- element_builder: Box::new(element_builder),
+ element_builder,
nulls: NullBufferBuilder::new(capacity),
current_offset: O::ZERO,
cast_options,
@@ -966,9 +1015,7 @@ where
}
fn finish(mut self) -> Result<ArrayRef> {
- let (value, typed_value, nulls) = self.element_builder.finish()?;
- let element_array =
- ShreddedVariantFieldArray::from_parts(Some(value),
Some(typed_value), nulls);
+ let element_array: ArrayRef = self.element_builder.finish()?;
let field = Arc::new(
self.field
.as_ref()
@@ -987,7 +1034,7 @@ where
field,
ScalarBuffer::from(self.offsets),
ScalarBuffer::from(sizes),
- ArrayRef::from(element_array),
+ element_array,
self.nulls.finish(),
);
Ok(Arc::new(list_view_array))
@@ -995,7 +1042,7 @@ where
let list_array = GenericListArray::<O>::new(
field,
OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets)),
- ArrayRef::from(element_array),
+ element_array,
self.nulls.finish(),
);
Ok(Arc::new(list_array))