This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 481223f875 feat(parquet-variant): add Dictionary and REE
variant_to_arrow support (#10014)
481223f875 is described below
commit 481223f8750957d47d34990fe1ae2a1a5aa0515a
Author: Neetika Mittal <[email protected]>
AuthorDate: Tue Jun 9 18:50:21 2026 +0100
feat(parquet-variant): add Dictionary and REE variant_to_arrow support
(#10014)
# Which issue does this PR close?
- Closes #10013
- Related to #6736
# Rationale for this change
`variant_get` / `variant_to_arrow` can already convert Variant values
into many native Arrow array layouts, but requesting
`DataType::Dictionary` or `DataType::RunEndEncoded` was not supported.
This PR adds support for those output encodings without changing Variant
shredding semantics. `Dictionary` and `RunEndEncoded` are produced as
Arrow result arrays only; they are not introduced as valid Parquet
Variant shredded `typed_value` layouts.
# What changes are included in this PR?
1. Adds an encoded output builder in `variant_to_arrow` for
`DataType::Dictionary` and `DataType::RunEndEncoded`.
2. Builds the logical child value array using the existing
Variant-to-Arrow builders, then delegates the final Dictionary or
run-end encoded (REE) encoding to Arrow's existing cast kernels.
3. Adds `variant_get` regression coverage for string dictionary, numeric
dictionary, and run-end encoded outputs.
# Are these changes tested?
Yes:
- `cargo fmt --check`
- `cargo test -p parquet-variant-compute`
- `cargo test -p parquet-variant`
- `cargo clippy --workspace --all-targets`
# Are there any user-facing changes?
Yes. `variant_get` with `as_type` set to `DataType::Dictionary` or
`DataType::RunEndEncoded` can now return those Arrow array encodings.
Co-authored-by: Neetika Mittal <[email protected]>
---
parquet-variant-compute/src/variant_get.rs | 80 ++++++++++++++++++++++++-
parquet-variant-compute/src/variant_to_arrow.rs | 63 ++++++++++++++++++-
2 files changed, 141 insertions(+), 2 deletions(-)
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 38c577564d..c3e9159935 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -448,7 +448,7 @@ mod test {
Time64NanosecondArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
- use arrow::compute::CastOptions;
+ use arrow::compute::{CastOptions, cast};
use arrow::datatypes::DataType::{Int16, Int32, Int64};
use arrow::datatypes::i256;
use arrow::util::display::FormatOptions;
@@ -4223,6 +4223,84 @@ mod test {
}
}
+ #[test]
+ fn get_variant_as_dictionary() {
+ let variant_array: ArrayRef =
ArrayRef::from(VariantArray::from_iter(vec![
+ Some(Variant::from("apple")),
+ Some(Variant::from("banana")),
+ None,
+ Some(Variant::from("apple")),
+ ]));
+ let data_type = DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8));
+ let options =
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+ "dict",
+ data_type.clone(),
+ true,
+ ))));
+
+ let result = variant_get(&variant_array, options).unwrap();
+ assert_eq!(result.data_type(), &data_type);
+
+ let decoded = cast(result.as_ref(), &DataType::Utf8).unwrap();
+ let expected = StringArray::from(vec![Some("apple"), Some("banana"),
None, Some("apple")]);
+ assert_eq!(decoded.as_ref(), &expected);
+ }
+
+ #[test]
+ fn get_variant_as_numeric_dictionary() {
+ let variant_array: ArrayRef =
ArrayRef::from(VariantArray::from_iter(vec![
+ Some(Variant::from(42)),
+ Some(Variant::from(7)),
+ None,
+ Some(Variant::from(42)),
+ ]));
+ let data_type = DataType::Dictionary(Box::new(DataType::Int16),
Box::new(DataType::Int32));
+ let options =
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+ "dict",
+ data_type.clone(),
+ true,
+ ))));
+
+ let result = variant_get(&variant_array, options).unwrap();
+ assert_eq!(result.data_type(), &data_type);
+
+ let decoded = cast(result.as_ref(), &DataType::Int32).unwrap();
+ let expected = Int32Array::from(vec![Some(42), Some(7), None,
Some(42)]);
+ assert_eq!(decoded.as_ref(), &expected);
+ }
+
+ #[test]
+ fn get_variant_as_run_end_encoded() {
+ let variant_array: ArrayRef =
ArrayRef::from(VariantArray::from_iter(vec![
+ Some(Variant::from("apple")),
+ Some(Variant::from("apple")),
+ None,
+ Some(Variant::from("banana")),
+ Some(Variant::from("banana")),
+ ]));
+ let run_ends = Arc::new(Field::new("run_ends", DataType::Int32,
false));
+ let values = Arc::new(Field::new("values", DataType::Utf8, true));
+ let data_type = DataType::RunEndEncoded(run_ends, values);
+ let options =
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+ "ree",
+ data_type.clone(),
+ true,
+ ))));
+
+ let result = variant_get(&variant_array, options).unwrap();
+ assert_eq!(result.data_type(), &data_type);
+
+ let decoded = cast(result.as_ref(), &DataType::Utf8).unwrap();
+ let expected = StringArray::from(vec![
+ Some("apple"),
+ Some("apple"),
+ None,
+ Some("banana"),
+ Some("banana"),
+ ]);
+ assert_eq!(decoded.as_ref(), &expected);
+ }
+
fn invalid_time_variant_array() -> ArrayRef {
let mut builder = VariantArrayBuilder::new(3);
// 86401000000 is invalid for Time64Microsecond (max is 86400000000)
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs
b/parquet-variant-compute/src/variant_to_arrow.rs
index ee6f1049ed..9841da555d 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -33,7 +33,7 @@ use arrow::array::{
StructArray,
};
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
-use arrow::compute::{CastOptions, DecimalCast};
+use arrow::compute::{CastOptions, DecimalCast, cast_with_options};
use arrow::datatypes::{self, DataType, DecimalType};
use arrow::error::{ArrowError, Result};
use arrow_schema::{FieldRef, Fields, TimeUnit};
@@ -48,6 +48,7 @@ pub(crate) enum VariantToArrowRowBuilder<'a> {
Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
Array(ArrayVariantToArrowRowBuilder<'a>),
Struct(StructVariantToArrowRowBuilder<'a>),
+ Encoded(EncodedVariantToArrowRowBuilder<'a>),
BinaryVariant(VariantToBinaryVariantArrowRowBuilder),
// Path extraction wrapper - contains a boxed enum for any of the above
@@ -61,6 +62,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
Primitive(b) => b.append_null(),
Array(b) => b.append_null(),
Struct(b) => b.append_null(),
+ Encoded(b) => b.append_null(),
BinaryVariant(b) => b.append_null(),
WithPath(path_builder) => path_builder.append_null(),
}
@@ -72,6 +74,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
Primitive(b) => b.append_value(&value),
Array(b) => b.append_value(&value),
Struct(b) => b.append_value(&value),
+ Encoded(b) => b.append_value(value),
BinaryVariant(b) => b.append_value(value),
WithPath(path_builder) => path_builder.append_value(value),
}
@@ -83,6 +86,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
Primitive(b) => b.finish(),
Array(b) => b.finish(),
Struct(b) => b.finish(),
+ Encoded(b) => b.finish(),
BinaryVariant(b) => b.finish(),
WithPath(path_builder) => path_builder.finish(),
}
@@ -110,6 +114,24 @@ fn make_typed_variant_to_arrow_row_builder<'a>(
ArrayVariantToArrowRowBuilder::try_new(data_type,
cast_options, capacity, false)?;
Ok(Array(builder))
}
+ DataType::Dictionary(_, value_type) => {
+ let builder = EncodedVariantToArrowRowBuilder::try_new(
+ data_type,
+ value_type.as_ref(),
+ cast_options,
+ capacity,
+ )?;
+ Ok(Encoded(builder))
+ }
+ DataType::RunEndEncoded(_, value_field) => {
+ let builder = EncodedVariantToArrowRowBuilder::try_new(
+ data_type,
+ value_field.data_type(),
+ cast_options,
+ capacity,
+ )?;
+ Ok(Encoded(builder))
+ }
data_type => {
let builder =
make_primitive_variant_to_arrow_row_builder(data_type,
cast_options, capacity)?;
@@ -331,6 +353,45 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
}
}
+pub(crate) struct EncodedVariantToArrowRowBuilder<'a> {
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions<'a>,
+ values_builder: Box<VariantToArrowRowBuilder<'a>>,
+}
+
+impl<'a> EncodedVariantToArrowRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ value_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ let values_builder = Box::new(make_typed_variant_to_arrow_row_builder(
+ value_type,
+ cast_options,
+ capacity,
+ )?);
+ Ok(Self {
+ data_type,
+ cast_options,
+ values_builder,
+ })
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.values_builder.append_null()
+ }
+
+ fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
+ self.values_builder.append_value(value)
+ }
+
+ fn finish(self) -> Result<ArrayRef> {
+ let values = self.values_builder.finish()?;
+ cast_with_options(values.as_ref(), self.data_type, self.cast_options)
+ }
+}
+
/// Creates a row builder that converts primitive `Variant` values into the
requested Arrow data type.
pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
data_type: &'a DataType,