This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new f248da3cc3 [VARIANT] Add support for DataType::Struct for
cast_to_variant (#8090)
f248da3cc3 is described below
commit f248da3cc39161af436b6337b2ae836168d13abe
Author: Aditya Bhatnagar <[email protected]>
AuthorDate: Thu Aug 14 13:41:30 2025 -0400
[VARIANT] Add support for DataType::Struct for cast_to_variant (#8090)
# Which issue does this PR close?
- Closes #8061
# Rationale for this change
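`cast_to_variant` previously rejected struct arrays with an "Unsupported data type" cast error. This PR adds support for `DataType::Struct` so that Arrow struct columns can be converted to Variant objects.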
# What changes are included in this PR?
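Adds a `DataType::Struct` arm to `cast_to_variant` that converts each non-null struct row into a Variant object (null child fields are omitted from the object; null struct rows become null variants), and adds unit tests covering flat structs, struct-level nulls, large inputs, and nested structs.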
# Are there any user-facing changes?
Yes. `cast_to_variant` is a public API, and it now accepts `DataType::Struct` input instead of returning an unsupported-type error.
Props to @mprammer!!
---
parquet-variant-compute/src/cast_to_variant.rs | 339 ++++++++++++++++++++++++-
1 file changed, 334 insertions(+), 5 deletions(-)
diff --git a/parquet-variant-compute/src/cast_to_variant.rs
b/parquet-variant-compute/src/cast_to_variant.rs
index 343d387b24..2df53a501e 100644
--- a/parquet-variant-compute/src/cast_to_variant.rs
+++ b/parquet-variant-compute/src/cast_to_variant.rs
@@ -34,7 +34,9 @@ use arrow::temporal_conversions::{
use arrow_schema::{ArrowError, DataType, TimeUnit};
use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
use half::f16;
-use parquet_variant::{Variant, VariantDecimal16, VariantDecimal4,
VariantDecimal8};
+use parquet_variant::{
+ Variant, VariantBuilder, VariantDecimal16, VariantDecimal4,
VariantDecimal8,
+};
/// Convert the input array of a specific primitive type to a `VariantArray`
/// row by row
@@ -367,6 +369,51 @@ pub fn cast_to_variant(input: &dyn Array) ->
Result<VariantArray, ArrowError> {
DataType::Utf8View => {
cast_conversion_nongeneric!(as_string_view, |v| v, input, builder);
}
+ DataType::Struct(_) => {
+ let struct_array = input.as_struct();
+
+ // Pre-convert all field arrays once for better performance
+ // This avoids converting the same field array multiple times
+ // Alternative approach: Use slicing per row: field_array.slice(i,
1)
+ // However, pre-conversion is more efficient for typical use cases
+ let field_variant_arrays: Result<Vec<_>, _> = struct_array
+ .columns()
+ .iter()
+ .map(|field_array| cast_to_variant(field_array.as_ref()))
+ .collect();
+ let field_variant_arrays = field_variant_arrays?;
+
+ // Cache column names to avoid repeated calls
+ let column_names = struct_array.column_names();
+
+ for i in 0..struct_array.len() {
+ if struct_array.is_null(i) {
+ builder.append_null();
+ continue;
+ }
+
+ // Create a VariantBuilder for this struct instance
+ let mut variant_builder = VariantBuilder::new();
+ let mut object_builder = variant_builder.new_object();
+
+ // Iterate through all fields in the struct
+ for (field_idx, field_name) in column_names.iter().enumerate()
{
+ // Use pre-converted field variant arrays for better
performance
+ // Check nulls directly from the pre-converted arrays
instead of accessing column again
+ if !field_variant_arrays[field_idx].is_null(i) {
+ let field_variant =
field_variant_arrays[field_idx].value(i);
+ object_builder.insert(field_name, field_variant);
+ }
+ // Note: null fields are omitted from the object rather than inserted as
+ // Variant::Null, so a null field reads back as a missing key
+ }
+
+ object_builder.finish()?;
+ let (metadata, value) = variant_builder.finish();
+ let variant = Variant::try_new(&metadata, &value)?;
+ builder.append_variant(variant);
+ }
+ }
dt => {
return Err(ArrowError::CastError(format!(
"Unsupported data type for casting to Variant: {dt:?}",
@@ -384,12 +431,14 @@ pub fn cast_to_variant(input: &dyn Array) ->
Result<VariantArray, ArrowError> {
mod tests {
use super::*;
use arrow::array::{
- ArrayRef, BooleanArray, Decimal128Array, Decimal256Array,
Decimal32Array, Decimal64Array,
- FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array,
GenericByteBuilder,
- GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array,
+ ArrayRef, BinaryArray, BooleanArray, Decimal128Array, Decimal256Array,
Decimal32Array,
+ Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array,
Float64Array,
+ GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array,
Int64Array, Int8Array,
IntervalYearMonthArray, LargeStringArray, NullArray, StringArray,
StringViewArray,
- UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+ StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
+ use arrow::buffer::NullBuffer;
+ use arrow_schema::{Field, Fields};
use arrow_schema::{
DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION,
DECIMAL64_MAX_PRECISION,
};
@@ -1286,6 +1335,286 @@ mod tests {
);
}
+ #[test]
+ fn test_cast_to_variant_struct() {
+ // Test a simple struct with two fields: id (int64) and age (int32)
+ let id_array = Int64Array::from(vec![Some(1001), Some(1002), None,
Some(1003)]);
+ let age_array = Int32Array::from(vec![Some(25), Some(30), Some(35),
None]);
+
+ let fields = Fields::from(vec![
+ Field::new("id", DataType::Int64, true),
+ Field::new("age", DataType::Int32, true),
+ ]);
+
+ let struct_array = StructArray::new(
+ fields,
+ vec![Arc::new(id_array), Arc::new(age_array)],
+ None, // no nulls at the struct level
+ );
+
+ let result = cast_to_variant(&struct_array).unwrap();
+ assert_eq!(result.len(), 4);
+
+ // Check first row: {"id": 1001, "age": 25}
+ let variant1 = result.value(0);
+ let obj1 = variant1.as_object().unwrap();
+ assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+ assert_eq!(obj1.get("age"), Some(Variant::from(25i32)));
+
+ // Check second row: {"id": 1002, "age": 30}
+ let variant2 = result.value(1);
+ let obj2 = variant2.as_object().unwrap();
+ assert_eq!(obj2.get("id"), Some(Variant::from(1002i64)));
+ assert_eq!(obj2.get("age"), Some(Variant::from(30i32)));
+
+ // Check third row: {"age": 35} (id is null, so omitted)
+ let variant3 = result.value(2);
+ let obj3 = variant3.as_object().unwrap();
+ assert_eq!(obj3.get("id"), None);
+ assert_eq!(obj3.get("age"), Some(Variant::from(35i32)));
+
+ // Check fourth row: {"id": 1003} (age is null, so omitted)
+ let variant4 = result.value(3);
+ let obj4 = variant4.as_object().unwrap();
+ assert_eq!(obj4.get("id"), Some(Variant::from(1003i64)));
+ assert_eq!(obj4.get("age"), None);
+ }
+
+ #[test]
+ fn test_cast_to_variant_struct_with_nulls() {
+ // Test struct with null values at the struct level
+ let id_array = Int64Array::from(vec![Some(1001), Some(1002)]);
+ let age_array = Int32Array::from(vec![Some(25), Some(30)]);
+
+ let fields = Fields::from(vec![
+ Field::new("id", DataType::Int64, false),
+ Field::new("age", DataType::Int32, false),
+ ]);
+
+ // Create null buffer to make second row null
+ let null_buffer = NullBuffer::from(vec![true, false]);
+
+ let struct_array = StructArray::new(
+ fields,
+ vec![Arc::new(id_array), Arc::new(age_array)],
+ Some(null_buffer),
+ );
+
+ let result = cast_to_variant(&struct_array).unwrap();
+ assert_eq!(result.len(), 2);
+
+ // Check first row: {"id": 1001, "age": 25}
+ assert!(!result.is_null(0));
+ let variant1 = result.value(0);
+ let obj1 = variant1.as_object().unwrap();
+ assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+ assert_eq!(obj1.get("age"), Some(Variant::from(25i32)));
+
+ // Check second row: null struct
+ assert!(result.is_null(1));
+ }
+
+ #[test]
+ fn test_cast_to_variant_struct_performance() {
+ // Test with a larger struct (1000 rows, 3 fields) and spot-check the
+ // first and last rows of the converted output
+ let size = 1000;
+
+ let id_array = Int64Array::from((0..size).map(|i| Some(i as
i64)).collect::<Vec<_>>());
+ let age_array = Int32Array::from(
+ (0..size)
+ .map(|i| Some((i % 100) as i32))
+ .collect::<Vec<_>>(),
+ );
+ let score_array =
+ Float64Array::from((0..size).map(|i| Some(i as f64 *
0.1)).collect::<Vec<_>>());
+
+ let fields = Fields::from(vec![
+ Field::new("id", DataType::Int64, false),
+ Field::new("age", DataType::Int32, false),
+ Field::new("score", DataType::Float64, false),
+ ]);
+
+ let struct_array = StructArray::new(
+ fields,
+ vec![
+ Arc::new(id_array),
+ Arc::new(age_array),
+ Arc::new(score_array),
+ ],
+ None,
+ );
+
+ let result = cast_to_variant(&struct_array).unwrap();
+ assert_eq!(result.len(), size);
+
+ // Verify a few sample rows
+ let variant0 = result.value(0);
+ let obj0 = variant0.as_object().unwrap();
+ assert_eq!(obj0.get("id"), Some(Variant::from(0i64)));
+ assert_eq!(obj0.get("age"), Some(Variant::from(0i32)));
+ assert_eq!(obj0.get("score"), Some(Variant::from(0.0f64)));
+
+ let variant999 = result.value(999);
+ let obj999 = variant999.as_object().unwrap();
+ assert_eq!(obj999.get("id"), Some(Variant::from(999i64)));
+ assert_eq!(obj999.get("age"), Some(Variant::from(99i32))); // 999 %
100 = 99
+ assert_eq!(obj999.get("score"), Some(Variant::from(99.9f64)));
+ }
+
+ #[test]
+ fn test_cast_to_variant_struct_performance_large() {
+ // Test with even larger struct and more fields to demonstrate
optimization benefits
+ let size = 10000;
+ let num_fields = 10;
+
+ // Create arrays for many fields
+ let mut field_arrays: Vec<ArrayRef> = Vec::new();
+ let mut fields = Vec::new();
+
+ for field_idx in 0..num_fields {
+ match field_idx % 4 {
+ 0 => {
+ // Int64 fields
+ let array = Int64Array::from(
+ (0..size)
+ .map(|i| Some(i as i64 + field_idx as i64))
+ .collect::<Vec<_>>(),
+ );
+ field_arrays.push(Arc::new(array));
+ fields.push(Field::new(
+ format!("int_field_{}", field_idx),
+ DataType::Int64,
+ false,
+ ));
+ }
+ 1 => {
+ // Int32 fields
+ let array = Int32Array::from(
+ (0..size)
+ .map(|i| Some((i % 1000) as i32 + field_idx as
i32))
+ .collect::<Vec<_>>(),
+ );
+ field_arrays.push(Arc::new(array));
+ fields.push(Field::new(
+ format!("int32_field_{}", field_idx),
+ DataType::Int32,
+ false,
+ ));
+ }
+ 2 => {
+ // Float64 fields
+ let array = Float64Array::from(
+ (0..size)
+ .map(|i| Some(i as f64 * 0.1 + field_idx as f64))
+ .collect::<Vec<_>>(),
+ );
+ field_arrays.push(Arc::new(array));
+ fields.push(Field::new(
+ format!("float_field_{}", field_idx),
+ DataType::Float64,
+ false,
+ ));
+ }
+ _ => {
+ // Binary fields
+ let binary_data: Vec<Option<&[u8]>> = (0..size)
+ .map(|i| {
+ // Use static data to avoid lifetime issues in
tests
+ match i % 3 {
+ 0 => Some(b"test_data_0" as &[u8]),
+ 1 => Some(b"test_data_1" as &[u8]),
+ _ => Some(b"test_data_2" as &[u8]),
+ }
+ })
+ .collect();
+ let array = BinaryArray::from(binary_data);
+ field_arrays.push(Arc::new(array));
+ fields.push(Field::new(
+ format!("binary_field_{}", field_idx),
+ DataType::Binary,
+ false,
+ ));
+ }
+ }
+ }
+
+ let struct_array = StructArray::new(Fields::from(fields),
field_arrays, None);
+
+ let result = cast_to_variant(&struct_array).unwrap();
+ assert_eq!(result.len(), size);
+
+ // Verify a sample of rows
+ for sample_idx in [0, size / 4, size / 2, size - 1] {
+ let variant = result.value(sample_idx);
+ let obj = variant.as_object().unwrap();
+
+ // Should have all fields
+ assert_eq!(obj.len(), num_fields);
+
+ // Verify a few field values
+ if let Some(int_field_0) = obj.get("int_field_0") {
+ assert_eq!(int_field_0, Variant::from(sample_idx as i64));
+ }
+ if let Some(float_field_2) = obj.get("float_field_2") {
+ assert_eq!(float_field_2, Variant::from(sample_idx as f64 *
0.1 + 2.0));
+ }
+ }
+ }
+
+ #[test]
+ fn test_cast_to_variant_nested_struct() {
+ // Test nested struct: person with location struct
+ let id_array = Int64Array::from(vec![Some(1001), Some(1002)]);
+ let x_array = Float64Array::from(vec![Some(40.7), Some(37.8)]);
+ let y_array = Float64Array::from(vec![Some(-74.0), Some(-122.4)]);
+
+ // Create location struct
+ let location_fields = Fields::from(vec![
+ Field::new("x", DataType::Float64, true),
+ Field::new("y", DataType::Float64, true),
+ ]);
+ let location_struct = StructArray::new(
+ location_fields.clone(),
+ vec![Arc::new(x_array), Arc::new(y_array)],
+ None,
+ );
+
+ // Create person struct containing location
+ let person_fields = Fields::from(vec![
+ Field::new("id", DataType::Int64, true),
+ Field::new("location", DataType::Struct(location_fields), true),
+ ]);
+ let person_struct = StructArray::new(
+ person_fields,
+ vec![Arc::new(id_array), Arc::new(location_struct)],
+ None,
+ );
+
+ let result = cast_to_variant(&person_struct).unwrap();
+ assert_eq!(result.len(), 2);
+
+ // Check first row
+ let variant1 = result.value(0);
+ let obj1 = variant1.as_object().unwrap();
+ assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+
+ let location_variant1 = obj1.get("location").unwrap();
+ let location_obj1 = location_variant1.as_object().unwrap();
+ assert_eq!(location_obj1.get("x"), Some(Variant::from(40.7f64)));
+ assert_eq!(location_obj1.get("y"), Some(Variant::from(-74.0f64)));
+
+ // Check second row
+ let variant2 = result.value(1);
+ let obj2 = variant2.as_object().unwrap();
+ assert_eq!(obj2.get("id"), Some(Variant::from(1002i64)));
+
+ let location_variant2 = obj2.get("location").unwrap();
+ let location_obj2 = location_variant2.as_object().unwrap();
+ assert_eq!(location_obj2.get("x"), Some(Variant::from(37.8f64)));
+ assert_eq!(location_obj2.get("y"), Some(Variant::from(-122.4f64)));
+ }
+
/// Converts the given `Array` to a `VariantArray` and tests the conversion
/// against the expected values. It also tests the handling of nulls by
/// setting one element to null and verifying the output.