klion26 commented on code in PR #8166: URL: https://github.com/apache/arrow-rs/pull/8166#discussion_r2311900100
########## parquet-variant-compute/src/variant_get/output/row_builder.rs: ########## @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::ArrayRef; +use arrow::datatypes; +use arrow::datatypes::ArrowPrimitiveType; +use arrow::error::{ArrowError, Result}; +use parquet_variant::{Variant, VariantPath}; + +use crate::VariantArrayBuilder; + +use std::sync::Arc; + +pub(crate) fn make_shredding_row_builder<'a>( + //metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a datatypes::DataType>, +) -> Result<Box<dyn VariantShreddingRowBuilder + 'a>> { + use arrow::array::PrimitiveBuilder; + use datatypes::Int32Type; + + // support non-empty paths (field access) and some empty path cases + if path.is_empty() { + return match data_type { + Some(datatypes::DataType::Int32) => { + // Return PrimitiveInt32Builder for type conversion + let builder = PrimitiveVariantShreddingRowBuilder { + builder: PrimitiveBuilder::<Int32Type>::new(), + }; + Ok(Box::new(builder)) + } + None => { + // Return VariantArrayBuilder for VariantArray output + let builder = VariantArrayShreddingRowBuilder::new(16); + Ok(Box::new(builder)) + } + _ => { + // only Int32 supported for empty paths + Err(ArrowError::NotYetImplemented(format!( + "variant_get with empty path and data_type={:?} not yet implemented", + data_type + ))) + } + }; + } + + // Non-empty paths: field access functionality + // Helper macro to reduce duplication when wrapping builders with path functionality + macro_rules! wrap_with_path { + ($inner_builder:expr) => { + Ok(Box::new(VariantPathRowBuilder { + builder: $inner_builder, + path, + }) as Box<dyn VariantShreddingRowBuilder + 'a>) + }; + } + + match data_type { + Some(datatypes::DataType::Int32) => { + // Create a primitive builder and wrap it with path functionality + let inner_builder = PrimitiveVariantShreddingRowBuilder { + builder: PrimitiveBuilder::<Int32Type>::new(), + }; + wrap_with_path!(inner_builder) + } + None => { + // Create a variant array builder and wrap it with path functionality + let inner_builder = VariantArrayShreddingRowBuilder::new(16); + wrap_with_path!(inner_builder) + } + _ => { + // only Int32 and VariantArray supported + Err(ArrowError::NotYetImplemented(format!( + "variant_get with path={:?} and data_type={:?} not yet implemented", + path, data_type + ))) + } + } +} + +/// Builder for shredding variant values into strongly typed Arrow arrays. +/// +/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly +/// with casting of leaf values to specific types. 
+pub(crate) trait VariantShreddingRowBuilder { + fn append_null(&mut self) -> Result<()>; + + fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool>; + + fn finish(&mut self) -> Result<ArrayRef>; +} + +/// A thin wrapper whose only job is to extract a specific path from a variant value and pass the +/// result to a nested builder. +struct VariantPathRowBuilder<'a, T: VariantShreddingRowBuilder> { Review Comment: The difference between `VariantPathRowBuilder` and `PrimitiveVariantShreddingRowBuilder` is that `VariantPathRowBuilder` holds a path that it uses to retrieve the value in `append_value`. The two could be merged if we moved the value-extraction logic to the caller. Would that make this cleaner? ########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -15,51 +15,240 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{Array, ArrayRef}, + array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, compute::CastOptions, + datatypes::Field, error::Result, }; -use arrow_schema::{ArrowError, FieldRef}; -use parquet_variant::VariantPath; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use parquet_variant::{VariantPath, VariantPathElement}; use crate::variant_array::ShreddingState; -use crate::variant_get::output::instantiate_output_builder; -use crate::VariantArray; +use crate::{variant_array::ShreddedVariantFieldArray, VariantArray}; + +use std::sync::Arc; mod output; +pub(crate) enum ShreddedPathStep<'a> { + /// Path step succeeded, return the new shredding state + Success(&'a ShreddingState), + /// The path element is not present in the `typed_value` column and there is no `value` column, + /// so we know it does not exist. It, and all paths under it, are all-NULL. + Missing, + /// The path element is not present in the `typed_value` column and must be retrieved from the `value` + /// column instead. The caller should be prepared to handle any value, including the requested + /// type, an arbitrary "wrong" type, or `Variant::Null`. + NotShredded, +} + +/// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step +/// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this +/// level, or if `typed_value` is not a struct, or if the requested field name does not exist. +/// +/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. +pub(crate) fn follow_shredded_path_element<'a>( + shredding_state: &'a ShreddingState, + path_element: &VariantPathElement<'_>, + cast_options: &CastOptions, +) -> Result<ShreddedPathStep<'a>> { + // If the requested path element is not present in `typed_value`, and `value` is missing, then + // we know it does not exist; it, and all paths under it, are all-NULL. + let missing_path_step = || { + let Some(_value_field) = shredding_state.value_field() else { Review Comment: We just want to return different results depending on `value_field`; would a `match` be cleaner here? (A sketch of the `match` form follows below, after the next hunk.) ########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -15,51 +15,240 @@ // specific language governing permissions and limitations // under the License.
use arrow::{ - array::{Array, ArrayRef}, + array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, compute::CastOptions, + datatypes::Field, error::Result, }; -use arrow_schema::{ArrowError, FieldRef}; -use parquet_variant::VariantPath; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use parquet_variant::{VariantPath, VariantPathElement}; use crate::variant_array::ShreddingState; -use crate::variant_get::output::instantiate_output_builder; -use crate::VariantArray; +use crate::{variant_array::ShreddedVariantFieldArray, VariantArray}; + +use std::sync::Arc; mod output; +pub(crate) enum ShreddedPathStep<'a> { + /// Path step succeeded, return the new shredding state + Success(&'a ShreddingState), + /// The path element is not present in the `typed_value` column and there is no `value` column, + /// so we know it does not exist. It, and all paths under it, are all-NULL. + Missing, + /// The path element is not present in the `typed_value` column and must be retrieved from the `value` + /// column instead. The caller should be prepared to handle any value, including the requested + /// type, an arbitrary "wrong" type, or `Variant::Null`. + NotShredded, +} + +/// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step +/// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this +/// level, or if `typed_value` is not a struct, or if the requested field name does not exist. +/// +/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. +pub(crate) fn follow_shredded_path_element<'a>( + shredding_state: &'a ShreddingState, + path_element: &VariantPathElement<'_>, + cast_options: &CastOptions, +) -> Result<ShreddedPathStep<'a>> { + // If the requested path element is not present in `typed_value`, and `value` is missing, then + // we know it does not exist; it, and all paths under it, are all-NULL. + let missing_path_step = || { + let Some(_value_field) = shredding_state.value_field() else { + return ShreddedPathStep::Missing; + }; + ShreddedPathStep::NotShredded + }; + + let Some(typed_value) = shredding_state.typed_value_field() else { + return Ok(missing_path_step()); + }; + + match path_element { + VariantPathElement::Field { name } => { + // Try to step into the requested field name of a struct. + // First, try to downcast to StructArray + let Some(struct_array) = typed_value.as_any().downcast_ref::<StructArray>() else { + // Downcast failure - if strict cast options are enabled, this should be an error + if !cast_options.safe { Review Comment: If `typed_value` can't be downcast to `StructArray` and `cast_options.safe` is `false`, we return an `Err`. Is there a chance the path could still be located in the `value` column? If so, is returning an `Err` here the expected behavior?
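One possible shape of the fallback this question points at, sketched against the names visible in this hunk (`shredding_state`, `cast_options`, `ShreddedPathStep`); this is only an illustration of the alternative being asked about, not a claim about the PR's intended semantics:

```rust
// Sketch only: prefer falling back to the `value` column when `typed_value`
// is not a struct, and only error under strict cast options when there is
// nothing to fall back to.
let Some(struct_array) = typed_value.as_any().downcast_ref::<StructArray>() else {
    if shredding_state.value_field().is_some() {
        // The field may still live in `value`; let the caller probe it.
        return Ok(ShreddedPathStep::NotShredded);
    }
    if !cast_options.safe {
        return Err(ArrowError::CastError(format!(
            "Cannot access field '{}' on non-struct type {}",
            name,
            typed_value.data_type()
        )));
    }
    return Ok(ShreddedPathStep::Missing);
};
```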
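And returning to the earlier question about `missing_path_step`: a minimal sketch of the `match` form, using the same names as the hunk above:

```rust
// `match` makes the two outcomes explicit: a `value` column means the path may
// still be found there; no `value` column means the path is definitely missing.
let missing_path_step = || match shredding_state.value_field() {
    Some(_) => ShreddedPathStep::NotShredded,
    None => ShreddedPathStep::Missing,
};
```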
########## parquet-variant-compute/src/variant_array.rs: ########## @@ -102,31 +105,57 @@ impl VariantArray { ))); }; - // Find the value field, if present - let value = inner - .column_by_name("value") - .map(|v| { - v.as_binary_view_opt().ok_or_else(|| { - ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - v.data_type() - )) - }) - }) - .transpose()?; - - // Find the typed_value field, if present - let typed_value = inner.column_by_name("typed_value"); + // Extract value and typed_value fields + let value = if let Some(value_col) = inner.column_by_name("value") { + if let Some(binary_view) = value_col.as_binary_view_opt() { + Some(binary_view.clone()) + } else { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + value_col.data_type() + ))); + } + } else { + None + }; + let typed_value = inner.column_by_name("typed_value").cloned(); // Note these clones are cheap, they just bump the ref count - let inner = inner.clone(); - let shredding_state = - ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?; - Ok(Self { + inner: inner.clone(), + metadata: metadata.clone(), + shredding_state: ShreddingState::try_new(metadata.clone(), value, typed_value)?, + }) + } + + #[allow(unused)] + pub(crate) fn from_parts( Review Comment: Do we need to add some docs for this? (A sketch of possible wording follows below, after the next hunk.) ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -102,31 +105,57 @@ impl VariantArray { ))); }; - // Find the value field, if present - let value = inner - .column_by_name("value") - .map(|v| { - v.as_binary_view_opt().ok_or_else(|| { - ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - v.data_type() - )) - }) - }) - .transpose()?; - - // Find the typed_value field, if present - let typed_value = inner.column_by_name("typed_value"); + // Extract value and typed_value fields + let value = if let Some(value_col) = inner.column_by_name("value") { + if let Some(binary_view) = value_col.as_binary_view_opt() { + Some(binary_view.clone()) + } else { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + value_col.data_type() + ))); + } + } else { + None + }; + let typed_value = inner.column_by_name("typed_value").cloned(); // Note these clones are cheap, they just bump the ref count - let inner = inner.clone(); - let shredding_state = - ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?; - Ok(Self { + inner: inner.clone(), + metadata: metadata.clone(), + shredding_state: ShreddingState::try_new(metadata.clone(), value, typed_value)?, + }) + } + + #[allow(unused)] + pub(crate) fn from_parts( + metadata: BinaryViewArray, + value: Option<BinaryViewArray>, + typed_value: Option<ArrayRef>, + nulls: Option<NullBuffer>, + ) -> Self { + let mut builder = + StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone())); + if let Some(value) = value.clone() { + builder = builder.with_field("value", Arc::new(value)); + } + if let Some(typed_value) = typed_value.clone() { + builder = builder.with_field("typed_value", typed_value); + } + if let Some(nulls) = nulls { Review Comment: `value` and `typed_value` are both cloned here; should `nulls` be handled the same way?
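For the documentation question on `from_parts` above, a sketch of the kind of doc comment that could be added; the wording is illustrative only, and it assumes `nulls` ends up as the top-level null buffer of the built `StructArray`:

```rust
/// Construct a `VariantArray` directly from its parts.
///
/// `metadata` is required; `value` and `typed_value` are the optional shredded
/// columns; `nulls`, if provided, becomes the top-level null buffer of the
/// built `StructArray`.
#[allow(unused)]
pub(crate) fn from_parts(
    metadata: BinaryViewArray,
    value: Option<BinaryViewArray>,
    typed_value: Option<ArrayRef>,
    nulls: Option<NullBuffer>,
) -> Self {
    // ...body as in the diff above...
}
```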
########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -500,4 +668,1013 @@ mod test { VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), ) } + /// This test manually constructs a shredded variant array representing objects + /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field + /// as VariantArray using variant_get. + #[test] + fn test_shredded_object_field_access() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as VariantArray first + let options = GetOptions::new_with_path(VariantPath::from("x")); + let result = variant_get(&array, options).unwrap(); + + let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result_variant.len(), 2); + + // Row 0: expect x=1 + assert_eq!(result_variant.value(0), Variant::Int32(1)); + // Row 1: expect x=42 + assert_eq!(result_variant.value(1), Variant::Int32(42)); + } + + /// Test extracting shredded object field with type conversion + #[test] + fn test_shredded_object_field_as_int32() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as Int32Array (type conversion) + let field = Field::new("x", DataType::Int32, false); + let options = GetOptions::new_with_path(VariantPath::from("x")) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // Should get Int32Array + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); + assert_eq!(&result, &expected); + } + + /// Helper function to create a shredded variant array representing objects + /// + /// This creates an array that represents: + /// Row 0: {"x": 1, "y": "foo"} (x is shredded, y is in value field) + /// Row 1: {"x": 42} (x is shredded, perfect shredding) + /// + /// The physical layout follows the shredding spec where: + /// - metadata: contains object metadata + /// - typed_value: StructArray with field "x" (ShreddedVariantFieldArray) + /// - value: contains fallback for unshredded fields like {"y": "foo"} + /// - The "x" field has typed_value=Int32Array and value=NULL (perfect shredding) + fn shredded_object_with_x_field_variant_array() -> ArrayRef { + // Create the base metadata for objects + let (metadata, y_field_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("x", Variant::Int32(42)); + obj.insert("y", Variant::from("foo")); + obj.finish().unwrap(); + builder.finish() + }; + + // Create metadata array (same for both rows) + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 2)); + + // Create the main value field per the 3-step shredding spec: + // Step 2: If field not in shredding schema, check value field + // Row 0: {"y": "foo"} (y is not shredded, stays in value for step 2) + // Row 1: {} (empty object - no unshredded fields) + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + let value_array = BinaryViewArray::from(vec![ + Some(y_field_value.as_slice()), // Row 0 has {"y": "foo"} + Some(empty_object_value.as_slice()), // Row 1 has {} + ]); + + // Create the "x" field as a ShreddedVariantFieldArray + // This represents the shredded Int32 values for the "x" field + let x_field_typed_value = Int32Array::from(vec![Some(1), Some(42)]); + + // For perfect 
shredding of the x field, no "value" column, only typed_value + let x_field_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_field_typed_value)) + .build(); + + // Wrap the x field struct in a ShreddedVariantFieldArray + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray"); + + // Create the main typed_value as a struct containing the "x" field + let typed_value_fields = Fields::from(vec![ + Field::new("x", x_field_shredded.data_type().clone(), true) + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![Arc::new(x_field_shredded)], + None, // No nulls - both rows have the object structure + ).unwrap(); + + // Create the main VariantArray + let main_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_struct)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array"), + ) + } + + /// Simple test to check if nested paths are supported by current implementation + #[test] + fn test_simple_nested_path_support() { Review Comment: Do we need to assert the expected behavior in this test function? ########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -500,4 +668,1013 @@ mod test { VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), ) } + /// This test manually constructs a shredded variant array representing objects + /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field + /// as VariantArray using variant_get. + #[test] + fn test_shredded_object_field_access() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as VariantArray first + let options = GetOptions::new_with_path(VariantPath::from("x")); + let result = variant_get(&array, options).unwrap(); + + let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result_variant.len(), 2); + + // Row 0: expect x=1 + assert_eq!(result_variant.value(0), Variant::Int32(1)); + // Row 1: expect x=42 + assert_eq!(result_variant.value(1), Variant::Int32(42)); + } + + /// Test extracting shredded object field with type conversion + #[test] + fn test_shredded_object_field_as_int32() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as Int32Array (type conversion) + let field = Field::new("x", DataType::Int32, false); + let options = GetOptions::new_with_path(VariantPath::from("x")) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // Should get Int32Array + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); + assert_eq!(&result, &expected); + } + + /// Helper function to create a shredded variant array representing objects + /// + /// This creates an array that represents: + /// Row 0: {"x": 1, "y": "foo"} (x is shredded, y is in value field) + /// Row 1: {"x": 42} (x is shredded, perfect shredding) + /// + /// The physical layout follows the shredding spec where: + /// - metadata: contains object metadata + /// - typed_value: StructArray with field "x" (ShreddedVariantFieldArray) + /// - value: contains fallback for unshredded fields like {"y": "foo"} + /// - The "x" field has typed_value=Int32Array and value=NULL 
(perfect shredding) + fn shredded_object_with_x_field_variant_array() -> ArrayRef { + // Create the base metadata for objects + let (metadata, y_field_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("x", Variant::Int32(42)); + obj.insert("y", Variant::from("foo")); + obj.finish().unwrap(); + builder.finish() + }; + + // Create metadata array (same for both rows) + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 2)); + + // Create the main value field per the 3-step shredding spec: + // Step 2: If field not in shredding schema, check value field + // Row 0: {"y": "foo"} (y is not shredded, stays in value for step 2) + // Row 1: {} (empty object - no unshredded fields) + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + let value_array = BinaryViewArray::from(vec![ + Some(y_field_value.as_slice()), // Row 0 has {"y": "foo"} + Some(empty_object_value.as_slice()), // Row 1 has {} + ]); + + // Create the "x" field as a ShreddedVariantFieldArray + // This represents the shredded Int32 values for the "x" field + let x_field_typed_value = Int32Array::from(vec![Some(1), Some(42)]); + + // For perfect shredding of the x field, no "value" column, only typed_value + let x_field_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_field_typed_value)) + .build(); + + // Wrap the x field struct in a ShreddedVariantFieldArray + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray"); + + // Create the main typed_value as a struct containing the "x" field + let typed_value_fields = Fields::from(vec![ + Field::new("x", x_field_shredded.data_type().clone(), true) + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![Arc::new(x_field_shredded)], + None, // No nulls - both rows have the object structure + ).unwrap(); + + // Create the main VariantArray + let main_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_struct)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array"), + ) + } + + /// Simple test to check if nested paths are supported by current implementation + #[test] + fn test_simple_nested_path_support() { + // Check: How does VariantPath parse different strings? 
+ println!("Testing path parsing:"); + + let path_x = VariantPath::from("x"); + let elements_x: Vec<_> = path_x.iter().collect(); + println!(" 'x' -> {} elements: {:?}", elements_x.len(), elements_x); + + let path_ax = VariantPath::from("a.x"); + let elements_ax: Vec<_> = path_ax.iter().collect(); + println!(" 'a.x' -> {} elements: {:?}", elements_ax.len(), elements_ax); + + let path_ax_alt = VariantPath::from("$.a.x"); + let elements_ax_alt: Vec<_> = path_ax_alt.iter().collect(); + println!(" '$.a.x' -> {} elements: {:?}", elements_ax_alt.len(), elements_ax_alt); + + let path_nested = VariantPath::from("a").join("x"); + let elements_nested: Vec<_> = path_nested.iter().collect(); + println!(" VariantPath::from('a').join('x') -> {} elements: {:?}", elements_nested.len(), elements_nested); + + // Use your existing simple test data but try "a.x" instead of "x" + let array = shredded_object_with_x_field_variant_array(); + + // Test if variant_get with REAL nested path throws not implemented error + let real_nested_path = VariantPath::from("a").join("x"); + let options = GetOptions::new_with_path(real_nested_path); + let result = variant_get(&array, options); + + match result { + Ok(_) => { + println!("Nested path 'a.x' works unexpectedly!"); + }, + Err(e) => { + println!("Nested path 'a.x' error: {}", e); + if e.to_string().contains("not yet implemented") + || e.to_string().contains("NotYetImplemented") { + println!("This is expected - nested paths are not implemented"); + return; + } + // Any other error is also expected for now + println!("This shows nested paths need implementation"); + } + } + } + + /// Test comprehensive variant_get scenarios with Int32 conversion + /// Test depth 0: Direct field access "x" with Int32 conversion + /// Covers shredded vs non-shredded VariantArrays for simple field access + #[test] + fn test_depth_0_int32_conversion() { + println!("=== Testing Depth 0: Direct field access ==="); + + // Non-shredded test data: [{"x": 42}, {"x": "foo"}, {"y": 10}] + let unshredded_array = create_depth_0_test_data(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("x"); + let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&unshredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(42), // {"x": 42} -> 42 + None, // {"x": "foo"} -> NULL (type mismatch) + None, // {"y": 10} -> NULL (field missing) + ])); + assert_eq!(&result, &expected); + println!("Depth 0 (unshredded) passed"); + + // Shredded test data: using simplified approach based on working pattern + let shredded_array = create_depth_0_shredded_test_data_simple(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("x"); + let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&shredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(42), // {"x": 42} -> 42 (from typed_value) + None, // {"x": "foo"} -> NULL (type mismatch, from value field) + ])); + assert_eq!(&result, &expected); + println!("Depth 0 (shredded) passed"); + } + + /// Test depth 1: Single nested field access "a.x" with Int32 conversion + /// Covers shredded vs non-shredded VariantArrays for nested field access + #[test] + fn test_depth_1_int32_conversion() { + println!("=== Testing Depth 1: Single nested field access ==="); + + // Non-shredded test data 
from the GitHub issue + let unshredded_array = create_nested_path_test_data(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("a.x"); // Dot notation! + let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&unshredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(55), // {"a": {"x": 55}} -> 55 + None, // {"a": {"x": "foo"}} -> NULL (type mismatch) + ])); + assert_eq!(&result, &expected); + println!("Depth 1 (unshredded) passed"); + + // Shredded test data: depth 1 nested shredding + let shredded_array = create_depth_1_shredded_test_data_working(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("a.x"); // Dot notation! + let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&shredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(55), // {"a": {"x": 55}} -> 55 (from nested shredded x) + None, // {"a": {"x": "foo"}} -> NULL (type mismatch in nested value) + ])); + assert_eq!(&result, &expected); + println!("Depth 1 (shredded) passed"); + } + + /// Test depth 2: Double nested field access "a.b.x" with Int32 conversion + /// Covers shredded vs non-shredded VariantArrays for deeply nested field access + #[test] + fn test_depth_2_int32_conversion() { + println!("=== Testing Depth 2: Double nested field access ==="); + + // Non-shredded test data: [{"a": {"b": {"x": 100}}}, {"a": {"b": {"x": "bar"}}}, {"a": {"b": {"y": 200}}}] + let unshredded_array = create_depth_2_test_data(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("a.b.x"); // Double nested dot notation! + let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&unshredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(100), // {"a": {"b": {"x": 100}}} -> 100 + None, // {"a": {"b": {"x": "bar"}}} -> NULL (type mismatch) + None, // {"a": {"b": {"y": 200}}} -> NULL (field missing) + ])); + assert_eq!(&result, &expected); + println!("Depth 2 (unshredded) passed"); + + // Shredded test data: depth 2 nested shredding + let shredded_array = create_depth_2_shredded_test_data_working(); + + let field = Field::new("result", DataType::Int32, true); + let path = VariantPath::from("a.b.x"); // Double nested dot notation! 
+ let options = GetOptions::new_with_path(path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&shredded_array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(100), // {"a": {"b": {"x": 100}}} -> 100 (from deeply nested shredded x) + None, // {"a": {"b": {"x": "bar"}}} -> NULL (type mismatch in deep value) + None, // {"a": {"b": {"y": 200}}} -> NULL (field missing in deep structure) + ])); + assert_eq!(&result, &expected); + println!("Depth 2 (shredded) passed"); + } + + /// Test that demonstrates what CURRENTLY WORKS + /// + /// This shows that nested path functionality does work, but only when the + /// test data matches what the current implementation expects + #[test] + fn test_current_nested_path_functionality() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field (single level) - this works + let single_path = VariantPath::from("x"); + let field = Field::new("result", DataType::Int32, true); + let options = GetOptions::new_with_path(single_path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + println!("Single path 'x' works - result: {:?}", result); + + // Test: Try nested path "a.x" - this is what we need to implement + let nested_path = VariantPath::from("a").join("x"); + let field = Field::new("result", DataType::Int32, true); + let options = GetOptions::new_with_path(nested_path) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + println!("Nested path 'a.x' result: {:?}", result); + } + + /// Create test data for depth 0 (direct field access) + /// [{"x": 42}, {"x": "foo"}, {"y": 10}] + fn create_depth_0_test_data() -> ArrayRef { + let mut builder = crate::VariantArrayBuilder::new(3); + + // Row 1: {"x": 42} + { + let json_str = r#"{"x": 42}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + // Row 2: {"x": "foo"} + { + let json_str = r#"{"x": "foo"}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + // Row 3: {"y": 10} (missing "x" field) + { + let json_str = r#"{"y": 10}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + Arc::new(builder.build()) + } + + /// Create test data for depth 1 (single nested field) + /// This represents the exact scenarios from the GitHub issue: "a.x" + fn create_nested_path_test_data() -> ArrayRef { + let mut builder = crate::VariantArrayBuilder::new(2); + + // Row 1: {"a": {"x": 55}, "b": 42} + { + let json_str = r#"{"a": {"x": 55}, "b": 42}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + // Row 2: {"a": {"x": "foo"}, "b": 42} + { + let json_str = r#"{"a": {"x": "foo"}, "b": 42}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = 
json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + Arc::new(builder.build()) + } + + /// Create test data for depth 2 (double nested field) + /// [{"a": {"b": {"x": 100}}}, {"a": {"b": {"x": "bar"}}}, {"a": {"b": {"y": 200}}}] + fn create_depth_2_test_data() -> ArrayRef { + let mut builder = crate::VariantArrayBuilder::new(3); + + // Row 1: {"a": {"b": {"x": 100}}} + { + let json_str = r#"{"a": {"b": {"x": 100}}}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + // Row 2: {"a": {"b": {"x": "bar"}}} + { + let json_str = r#"{"a": {"b": {"x": "bar"}}}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + // Row 3: {"a": {"b": {"y": 200}}} (missing "x" field) + { + let json_str = r#"{"a": {"b": {"y": 200}}}"#; + let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); + if let Ok(variant_array) = json_to_variant(&string_array) { + builder.append_variant(variant_array.value(0)); + } else { + builder.append_null(); + } + } + + Arc::new(builder.build()) + } + + /// Create simple shredded test data for depth 0 using a simplified working pattern + /// Creates 2 rows: [{"x": 42}, {"x": "foo"}] with "x" shredded where possible + fn create_depth_0_shredded_test_data_simple() -> ArrayRef { + // Create base metadata using the working pattern + let (metadata, string_x_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("x", Variant::from("foo")); + obj.finish().unwrap(); + builder.finish() + }; + + // Metadata array (same for both rows) + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 2)); + + // Value array following the 3-step shredding spec: + // Row 0: {} (x is shredded, no unshredded fields) + // Row 1: {"x": "foo"} (x is a string, can't be shredded to Int32) + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + let value_array = BinaryViewArray::from(vec![ + Some(empty_object_value.as_slice()), // Row 0: {} (x shredded out) + Some(string_x_value.as_slice()), // Row 1: {"x": "foo"} (fallback) + ]); + + // Create the "x" field as a ShreddedVariantFieldArray + let x_field_typed_value = Int32Array::from(vec![Some(42), None]); + + // For the x field, only typed_value (perfect shredding when possible) + let x_field_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_field_typed_value)) + .build(); + + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray"); + + // Create the main typed_value as a struct containing the "x" field + let typed_value_fields = Fields::from(vec![ + Field::new("x", x_field_shredded.data_type().clone(), true) + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![Arc::new(x_field_shredded)], + None, + ).unwrap(); + + // Build final VariantArray + let struct_array = crate::variant_array::StructArrayBuilder::new() + 
.with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_struct)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray"), + ) + } + + /// Create working depth 1 shredded test data based on the existing working pattern + /// This creates a properly structured shredded variant for "a.x" where: + /// - Row 0: {"a": {"x": 55}, "b": 42} with a.x shredded into typed_value + /// - Row 1: {"a": {"x": "foo"}, "b": 42} with a.x fallback to value field due to type mismatch + fn create_depth_1_shredded_test_data_working() -> ArrayRef { + // Create metadata following the working pattern from shredded_object_with_x_field_variant_array + let (metadata, _) = { + // Create nested structure: {"a": {"x": 55}, "b": 42} + let a_variant = { + let mut a_builder = parquet_variant::VariantBuilder::new(); + let mut a_obj = a_builder.new_object(); + a_obj.insert("x", Variant::Int32(55)); // "a.x" field (shredded when possible) + a_obj.finish().unwrap() + }; + + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("a", a_variant); + obj.insert("b", Variant::Int32(42)); + obj.finish().unwrap(); + builder.finish() + }; + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 2)); + + // Create value arrays for the fallback case + // Following the spec: if field cannot be shredded, it stays in value + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + // Row 1 fallback: use the working pattern from the existing shredded test + // This avoids metadata issues by using the simple fallback approach + let row1_fallback = { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("fallback", Variant::from("data")); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + let value_array = BinaryViewArray::from(vec![ + Some(empty_object_value.as_slice()), // Row 0: {} (everything shredded except b in unshredded fields) + Some(row1_fallback.as_slice()), // Row 1: {"a": {"x": "foo"}, "b": 42} (a.x can't be shredded) + ]); + + // Create the nested shredded structure + // Level 2: x field (the deepest level) + let x_typed_value = Int32Array::from(vec![Some(55), None]); + let x_field_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_typed_value)) + .build(); + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray for x"); + + // Level 1: a field containing x field + value field for fallbacks + // The "a" field needs both typed_value (for shredded x) and value (for fallback cases) + + // Create the value field for "a" (for cases where a.x can't be shredded) + let a_value_data = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + let a_value_array = BinaryViewArray::from(vec![ + None, // Row 0: x is shredded, so no value fallback needed + Some(a_value_data.as_slice()), // Row 1: fallback for a.x="foo" (but logic will check typed_value first) + ]); + + let a_inner_fields = Fields::from(vec![ + Field::new("x", 
x_field_shredded.data_type().clone(), true) + ]); + let a_inner_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(StructArray::try_new( + a_inner_fields, + vec![Arc::new(x_field_shredded)], + None, + ).unwrap())) + .with_field("value", Arc::new(a_value_array)) + .build(); + let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_inner_struct)) + .expect("should create ShreddedVariantFieldArray for a"); + + // Level 0: main typed_value struct containing a field + let typed_value_fields = Fields::from(vec![ + Field::new("a", a_field_shredded.data_type().clone(), true) + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![Arc::new(a_field_shredded)], + None, + ).unwrap(); + + // Build final VariantArray + let struct_array = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_struct)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray"), + ) + } + + /// Create working depth 2 shredded test data for "a.b.x" paths + /// This creates a 3-level nested shredded structure where: + /// - Row 0: {"a": {"b": {"x": 100}}} with a.b.x shredded into typed_value + /// - Row 1: {"a": {"b": {"x": "bar"}}} with type mismatch fallback + /// - Row 2: {"a": {"b": {"y": 200}}} with missing field fallback + fn create_depth_2_shredded_test_data_working() -> ArrayRef { + // Create metadata following the working pattern + let (metadata, _) = { + // Create deeply nested structure: {"a": {"b": {"x": 100}}} + let b_variant = { + let mut b_builder = parquet_variant::VariantBuilder::new(); + let mut b_obj = b_builder.new_object(); + b_obj.insert("x", Variant::Int32(100)); + b_obj.finish().unwrap() + }; + + let a_variant = { + let mut a_builder = parquet_variant::VariantBuilder::new(); + let mut a_obj = a_builder.new_object(); + a_obj.insert("b", b_variant); + a_obj.finish().unwrap() + }; + + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("a", a_variant); // "a" field containing b + obj.finish().unwrap(); + builder.finish() + }; + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + + // Create value arrays for fallback cases + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + + // Simple fallback values - avoiding complex nested metadata + let value_array = BinaryViewArray::from(vec![ + Some(empty_object_value.as_slice()), // Row 0: fully shredded + Some(empty_object_value.as_slice()), // Row 1: fallback (simplified) + Some(empty_object_value.as_slice()), // Row 2: fallback (simplified) + ]); + + // Create the deeply nested shredded structure: a.b.x + + // Level 3: x field (deepest level) + let x_typed_value = Int32Array::from(vec![Some(100), None, None]); + let x_field_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_typed_value)) + .build(); + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray for x"); + + // Level 2: b field containing x field + value field + let b_value_data = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = 
builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + let b_value_array = BinaryViewArray::from(vec![ + None, // Row 0: x is shredded + Some(b_value_data.as_slice()), // Row 1: fallback for b.x="bar" + Some(b_value_data.as_slice()), // Row 2: fallback for b.y=200 + ]); + + let b_inner_fields = Fields::from(vec![ + Field::new("x", x_field_shredded.data_type().clone(), true) + ]); + let b_inner_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(StructArray::try_new( + b_inner_fields, + vec![Arc::new(x_field_shredded)], + None, + ).unwrap())) + .with_field("value", Arc::new(b_value_array)) + .build(); + let b_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(b_inner_struct)) + .expect("should create ShreddedVariantFieldArray for b"); + + // Level 1: a field containing b field + value field + let a_value_data = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish().unwrap(); + let (_, value) = builder.finish(); + value + }; + let a_value_array = BinaryViewArray::from(vec![ + None, // Row 0: b is shredded + Some(a_value_data.as_slice()), // Row 1: fallback for a.b.* + Some(a_value_data.as_slice()), // Row 2: fallback for a.b.* + ]); + + let a_inner_fields = Fields::from(vec![ + Field::new("b", b_field_shredded.data_type().clone(), true) + ]); + let a_inner_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("typed_value", Arc::new(StructArray::try_new( + a_inner_fields, + vec![Arc::new(b_field_shredded)], + None, + ).unwrap())) + .with_field("value", Arc::new(a_value_array)) + .build(); + let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_inner_struct)) + .expect("should create ShreddedVariantFieldArray for a"); + + // Level 0: main typed_value struct containing a field + let typed_value_fields = Fields::from(vec![ + Field::new("a", a_field_shredded.data_type().clone(), true) + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![Arc::new(a_field_shredded)], + None, + ).unwrap(); + + // Build final VariantArray + let struct_array = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_struct)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray"), + ) + } + + #[test] + fn test_strict_cast_options_downcast_failure() { + use arrow::error::ArrowError; + use arrow::compute::CastOptions; + use arrow::datatypes::{DataType, Field}; + use std::sync::Arc; + use parquet_variant::VariantPath; + + // Use the existing simple test data that has Int32 as typed_value + let variant_array = perfectly_shredded_int32_variant_array(); + + // Try to access a field with safe cast options (should return NULLs) + let safe_options = GetOptions { + path: VariantPath::from("nonexistent_field"), + as_type: Some(Arc::new(Field::new("result", DataType::Int32, true))), + cast_options: CastOptions::default(), // safe = true + }; + + let variant_array_ref: Arc<dyn arrow::array::Array> = variant_array.clone(); + let result = variant_get(&variant_array_ref, safe_options); + // Should succeed and return NULLs (safe behavior) + assert!(result.is_ok()); + let result_array = result.unwrap(); + assert_eq!(result_array.len(), 3); + assert!(result_array.is_null(0)); + assert!(result_array.is_null(1)); + 
assert!(result_array.is_null(2)); + + // Try to access a field with strict cast options (should error) + let strict_options = GetOptions { + path: VariantPath::from("nonexistent_field"), + as_type: Some(Arc::new(Field::new("result", DataType::Int32, true))), + cast_options: CastOptions { safe: false, ..Default::default() }, + }; + + let result = variant_get(&variant_array_ref, strict_options); + // Should fail with a cast error + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(matches!(error, ArrowError::CastError(_))); + assert!(error.to_string().contains("Cannot access field 'nonexistent_field' on non-struct type")); + } + + #[test] + fn test_null_buffer_union_for_shredded_paths() { + use arrow::compute::CastOptions; Review Comment: Do we need to add another row where path `a.x` does not exist?
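A hypothetical illustration of the rows being suggested (the exact construction depends on how this test builds its shredded input, which is not shown here): one row where `a` exists but has no `x`, and one where `a` itself is absent, with matching `None` entries in the expected output.

```rust
// Hypothetical extra rows for the suggestion above (values illustrative only).
let extra_rows = [
    r#"{"a": {"y": 2}}"#, // "a.x" missing: "a" exists but has no "x"
    r#"{"b": 3}"#,        // "a.x" missing: "a" itself is absent
];
// The expected typed output for these rows would then be all-NULL, e.g.:
let expected_tail = Int32Array::from(vec![None::<i32>, None]);
```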
