scovich commented on code in PR #8343:
URL: https://github.com/apache/arrow-rs/pull/8343#discussion_r2348955223


##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -1991,4 +1956,499 @@ mod test {
         // Row 8: Large Int64 should fail conversion -> NULL
         assert!(int32_result_2.is_null(8)); // 9223372036854775807 (too large 
for Int32)
     }
+
+    #[test]
+    fn test_struct_extraction_subset_superset_schema_perfectly_shredded() {
+        // Create variant with diverse null patterns and empty objects
+        let variant_array = create_comprehensive_shredded_variant();
+
+        // Request struct with fields "a", "b", "d" (skip existing "c", add 
missing "d")
+        let struct_fields = Fields::from(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("d", DataType::Int32, true),
+        ]);
+        let struct_type = DataType::Struct(struct_fields);
+
+        let options = GetOptions {
+            path: VariantPath::default(),
+            as_type: Some(Arc::new(Field::new("result", struct_type, true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let result = variant_get(&variant_array, options).unwrap();
+
+        // Verify the result is a StructArray with 3 fields and 5 rows
+        let struct_result = 
result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(struct_result.len(), 5);
+        assert_eq!(struct_result.num_columns(), 3);
+
+        let field_a = struct_result
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        let field_b = struct_result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        let field_d = struct_result
+            .column(2)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+
+        // Row 0: Normal values {"a": 1, "b": 2, "c": 3} → {a: 1, b: 2, d: 
NULL}
+        assert!(!struct_result.is_null(0));
+        assert_eq!(field_a.value(0), 1);
+        assert_eq!(field_b.value(0), 2);
+        assert!(field_d.is_null(0)); // Missing field "d"
+
+        // Row 1: Top-level NULL → struct-level NULL
+        assert!(struct_result.is_null(1));
+
+        // Row 2: Field "a" missing → {a: NULL, b: 2, d: NULL}
+        assert!(!struct_result.is_null(2));
+        assert!(field_a.is_null(2)); // Missing field "a"
+        assert_eq!(field_b.value(2), 2);
+        assert!(field_d.is_null(2)); // Missing field "d"
+
+        // Row 3: Field "b" missing → {a: 1, b: NULL, d: NULL}
+        assert!(!struct_result.is_null(3));
+        assert_eq!(field_a.value(3), 1);
+        assert!(field_b.is_null(3)); // Missing field "b"
+        assert!(field_d.is_null(3)); // Missing field "d"
+
+        // Row 4: Empty object {} → {a: NULL, b: NULL, d: NULL}
+        assert!(!struct_result.is_null(4));
+        assert!(field_a.is_null(4)); // Empty object
+        assert!(field_b.is_null(4)); // Empty object
+        assert!(field_d.is_null(4)); // Missing field "d"
+    }
+
+    #[test]
+    fn test_nested_struct_extraction_perfectly_shredded() {
+        // Create nested variant with diverse null patterns
+        let variant_array = create_comprehensive_nested_shredded_variant();
+        println!("variant_array: {variant_array:?}");
+
+        // Request 3-level nested struct type {"outer": {"inner": INT}}
+        let inner_field = Field::new("inner", DataType::Int32, true);
+        let inner_type = DataType::Struct(Fields::from(vec![inner_field]));
+        let outer_field = Field::new("outer", inner_type, true);
+        let result_type = DataType::Struct(Fields::from(vec![outer_field]));
+
+        let options = GetOptions {
+            path: VariantPath::default(),
+            as_type: Some(Arc::new(Field::new("result", result_type, true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let result = variant_get(&variant_array, options).unwrap();
+        println!("result: {result:?}");
+
+        // Verify the result is a StructArray with "outer" field and 4 rows
+        let outer_struct = 
result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(outer_struct.len(), 4);
+        assert_eq!(outer_struct.num_columns(), 1);
+
+        // Get the "inner" struct column
+        let inner_struct = outer_struct
+            .column(0)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+        assert_eq!(inner_struct.num_columns(), 1);
+
+        // Get the "leaf" field (Int32 values)
+        let leaf_field = inner_struct
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+
+        // Row 0: Normal nested {"outer": {"inner": {"leaf": 42}}}
+        assert!(!outer_struct.is_null(0));
+        assert!(!inner_struct.is_null(0));
+        assert_eq!(leaf_field.value(0), 42);
+
+        // Row 1: "inner" field missing → {outer: {inner: NULL}}
+        assert!(!outer_struct.is_null(1));
+        assert!(!inner_struct.is_null(1)); // outer exists, inner exists but 
leaf is NULL
+        assert!(leaf_field.is_null(1)); // leaf field is NULL
+
+        // Row 2: "outer" field missing → {outer: NULL}
+        assert!(!outer_struct.is_null(2));
+        assert!(inner_struct.is_null(2)); // outer field is NULL
+
+        // Row 3: Top-level NULL → struct-level NULL
+        assert!(outer_struct.is_null(3));
+    }
+
+    #[test]
+    fn test_path_based_null_masks_one_step() {
+        // Create nested variant with diverse null patterns
+        let variant_array = create_comprehensive_nested_shredded_variant();
+
+        // Extract "outer" field using path-based variant_get
+        let path = VariantPath::from("outer");
+        let inner_field = Field::new("inner", DataType::Int32, true);
+        let result_type = DataType::Struct(Fields::from(vec![inner_field]));
+
+        let options = GetOptions {
+            path,
+            as_type: Some(Arc::new(Field::new("result", result_type, true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let result = variant_get(&variant_array, options).unwrap();
+
+        // Verify the result is a StructArray with "inner" field and 4 rows
+        let outer_result = 
result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(outer_result.len(), 4);
+        assert_eq!(outer_result.num_columns(), 1);
+
+        // Get the "inner" field (Int32 values)
+        let inner_field = outer_result
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+
+        // Row 0: Normal nested {"outer": {"inner": 42}} → {"inner": 42}
+        assert!(!outer_result.is_null(0));
+        assert_eq!(inner_field.value(0), 42);
+
+        // Row 1: Inner field null {"outer": {"inner": null}} → {"inner": null}
+        assert!(!outer_result.is_null(1));
+        assert!(inner_field.is_null(1));
+
+        // Row 2: Outer field null {"outer": null} → null (entire struct is 
null)
+        assert!(outer_result.is_null(2));
+
+        // Row 3: Top-level null → null (entire struct is null)
+        assert!(outer_result.is_null(3));
+    }
+
+    #[test]
+    fn test_path_based_null_masks_two_steps() {
+        // Create nested variant with diverse null patterns
+        let variant_array = create_comprehensive_nested_shredded_variant();
+
+        // Extract "outer.inner" field using path-based variant_get
+        let path = VariantPath::from("outer").join("inner");
+
+        let options = GetOptions {
+            path,
+            as_type: Some(Arc::new(Field::new("result", DataType::Int32, 
true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let result = variant_get(&variant_array, options).unwrap();
+
+        // Verify the result is an Int32Array with 4 rows
+        let int_result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(int_result.len(), 4);
+
+        // Row 0: Normal nested {"outer": {"inner": 42}} → 42
+        assert!(!int_result.is_null(0));
+        assert_eq!(int_result.value(0), 42);
+
+        // Row 1: Inner field null {"outer": {"inner": null}} → null
+        assert!(int_result.is_null(1));
+
+        // Row 2: Outer field null {"outer": null} → null (path traversal 
fails)
+        assert!(int_result.is_null(2));
+
+        // Row 3: Top-level null → null (path traversal fails)
+        assert!(int_result.is_null(3));
+    }
+
+    #[test]
+    fn test_struct_extraction_mixed_and_unshredded() {
+        // Create a partially shredded variant (x shredded, y not)
+        let variant_array = create_mixed_and_unshredded_variant();
+
+        // Request struct with both shredded and unshredded fields
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, true),
+            Field::new("y", DataType::Int32, true),
+        ]);
+        let struct_type = DataType::Struct(struct_fields);
+
+        let options = GetOptions {
+            path: VariantPath::default(),
+            as_type: Some(Arc::new(Field::new("result", struct_type, true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let result = variant_get(&variant_array, options).unwrap();
+
+        // Verify the mixed shredding works (should succeed with current 
implementation)
+        let struct_result = 
result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(struct_result.len(), 4);
+        assert_eq!(struct_result.num_columns(), 2);
+
+        let field_x = struct_result
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        let field_y = struct_result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+
+        // Row 0: {"x": 1, "y": 42} - x from shredded, y from value field
+        assert_eq!(field_x.value(0), 1);
+        assert_eq!(field_y.value(0), 42);
+
+        // Row 1: {"x": 2} - x from shredded, y missing (perfect shredding)
+        assert_eq!(field_x.value(1), 2);
+        assert!(field_y.is_null(1));
+
+        // Row 2: {"x": 3, "y": null} - x from shredded, y explicitly null in 
value
+        assert_eq!(field_x.value(2), 3);
+        assert!(field_y.is_null(2));
+
+        // Row 3: top-level null - entire struct row should be null
+        assert!(struct_result.is_null(3));
+    }
+
+    /// Test that demonstrates the actual struct row builder gap
+    /// This test should fail because it hits unshredded nested structs
+    #[test]
+    fn test_struct_row_builder_gap_demonstration() {
+        // Create completely unshredded JSON variant (no typed_value at all)
+        let json_strings = vec![
+            r#"{"outer": {"inner": 42}}"#,
+            r#"{"outer": {"inner": 100}}"#,
+        ];
+        let string_array: Arc<dyn Array> = 
Arc::new(StringArray::from(json_strings));
+        let variant_array = json_to_variant(&string_array).unwrap();
+
+        // Request nested struct - this should fail at the row builder level
+        let inner_fields = Fields::from(vec![Field::new("inner", 
DataType::Int32, true)]);
+        let inner_struct_type = DataType::Struct(inner_fields);
+        let outer_fields = Fields::from(vec![Field::new("outer", 
inner_struct_type, true)]);
+        let outer_struct_type = DataType::Struct(outer_fields);
+
+        let options = GetOptions {
+            path: VariantPath::default(),
+            as_type: Some(Arc::new(Field::new("result", outer_struct_type, 
true))),
+            cast_options: CastOptions::default(),
+        };
+
+        let variant_array_ref: Arc<dyn Array> = Arc::new(variant_array);
+        let result = variant_get(&variant_array_ref, options);
+
+        // Should fail with NotYetImplemented when the row builder tries to 
handle struct type
+        assert!(result.is_err());
+        let error = result.unwrap_err();
+        assert!(error.to_string().contains("not yet implemented"));
+    }
+
+    /// Create comprehensive shredded variant with diverse null patterns and 
empty objects
+    /// Rows: normal values, top-level null, missing field a, missing field b, 
empty object
+    fn create_comprehensive_shredded_variant() -> ArrayRef {
+        let (metadata, _) = {
+            let mut builder = parquet_variant::VariantBuilder::new();
+            let obj = builder.new_object();
+            obj.finish();
+            builder.finish()
+        };
+
+        // Create null buffer for top-level nulls
+        let nulls = NullBuffer::from(vec![
+            true,  // row 0: normal values
+            false, // row 1: top-level null
+            true,  // row 2: missing field a
+            true,  // row 3: missing field b
+            true,  // row 4: empty object
+        ]);
+
+        let metadata_array = 
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 5));
+
+        // Create shredded fields with different null patterns
+        // Field "a": present in rows 0,3 (missing in rows 1,2,4)
+        let a_field_typed_value = Int32Array::from(vec![Some(1), None, None, 
Some(1), None]);
+        let a_field_struct = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(a_field_typed_value), true)
+            .build();
+        let a_field_shredded = 
ShreddedVariantFieldArray::try_new(Arc::new(a_field_struct))
+            .expect("should create ShreddedVariantFieldArray for a");
+
+        // Field "b": present in rows 0,2 (missing in rows 1,3,4)
+        let b_field_typed_value = Int32Array::from(vec![Some(2), None, 
Some(2), None, None]);
+        let b_field_struct = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(b_field_typed_value), true)
+            .build();
+        let b_field_shredded = 
ShreddedVariantFieldArray::try_new(Arc::new(b_field_struct))
+            .expect("should create ShreddedVariantFieldArray for b");
+
+        // Field "c": present in row 0 only (missing in all other rows)
+        let c_field_typed_value = Int32Array::from(vec![Some(3), None, None, 
None, None]);
+        let c_field_struct = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(c_field_typed_value), true)
+            .build();
+        let c_field_shredded = 
ShreddedVariantFieldArray::try_new(Arc::new(c_field_struct))
+            .expect("should create ShreddedVariantFieldArray for c");
+
+        // Create main typed_value struct
+        let typed_value_fields = Fields::from(vec![
+            Field::new("a", a_field_shredded.data_type().clone(), true),
+            Field::new("b", b_field_shredded.data_type().clone(), true),
+            Field::new("c", c_field_shredded.data_type().clone(), true),
+        ]);
+        let typed_value_struct = StructArray::try_new(
+            typed_value_fields,
+            vec![
+                Arc::new(a_field_shredded),
+                Arc::new(b_field_shredded),
+                Arc::new(c_field_shredded),
+            ],
+            None,
+        )
+        .unwrap();
+
+        // Build final VariantArray with top-level nulls
+        let struct_array = StructArrayBuilder::new()
+            .with_field("metadata", Arc::new(metadata_array), false)
+            .with_field("typed_value", Arc::new(typed_value_struct), true)
+            .with_nulls(nulls)
+            .build();
+
+        Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should 
create VariantArray"))
+    }
+
+    /// Create comprehensive nested shredded variant with diverse null patterns
+    /// Represents 3-level structure: variant -> outer -> inner (INT value)
+    /// The shredding schema is: {"metadata": BINARY, "typed_value": {"outer": 
{"typed_value": {"inner": {"typed_value": INT}}}}}
+    /// Rows: normal nested value, inner field null, outer field null, 
top-level null
+    fn create_comprehensive_nested_shredded_variant() -> ArrayRef {
+        // Create the inner level: contains typed_value with Int32 values
+        // Row 0: has value 42, Row 1: inner null, Row 2: outer null, Row 3: 
top-level null
+        let inner_typed_value = Int32Array::from(vec![Some(42), None, None, 
None]); // dummy value for row 2
+        let inner = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(inner_typed_value), true)
+            .build();
+        let inner = 
ShreddedVariantFieldArray::try_new(Arc::new(inner)).unwrap();
+
+        let outer_typed_value_nulls = NullBuffer::from(vec![
+            true,  // row 0: inner struct exists with typed_value=42
+            false, // row 1: inner field NULL
+            false, // row 2: outer field NULL
+            false, // row 3: top-level NULL
+        ]);
+        let outer_typed_value = StructArrayBuilder::new()
+            .with_field("inner", Arc::new(inner), false)
+            .with_nulls(outer_typed_value_nulls)
+            .build();
+
+        let outer = StructArrayBuilder::new()
+            .with_field("typed_value", Arc::new(outer_typed_value), true)
+            .build();
+        let outer = 
ShreddedVariantFieldArray::try_new(Arc::new(outer)).unwrap();
+
+        let typed_value_nulls = NullBuffer::from(vec![
+            true,  // row 0: inner struct exists with typed_value=42
+            true,  // row 1: inner field NULL
+            false, // row 2: outer field NULL
+            false, // row 3: top-level NULL
+        ]);
+        let typed_value = StructArrayBuilder::new()
+            .with_field("outer", Arc::new(outer), false)
+            .with_nulls(typed_value_nulls)
+            .build();
+
+        // Build final VariantArray with top-level nulls
+        let (metadata, _) = parquet_variant::VariantBuilder::new().finish();
+        let metadata_array = 
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
+        let nulls = NullBuffer::from(vec![
+            true,  // row 0: inner struct exists with typed_value=42
+            true,  // row 1: inner field NULL
+            true,  // row 2: outer field NULL
+            false, // row 3: top-level NULL
+        ]);
+        let struct_array = StructArrayBuilder::new()
+            .with_field("metadata", Arc::new(metadata_array), false)
+            .with_field("typed_value", Arc::new(typed_value), true)
+            .with_nulls(nulls)
+            .build();
+
+        Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should 
create VariantArray"))
+    }
+
+    /// Create variant with mixed shredding (spec-compliant) including null 
scenarios
+    /// Field "x" is globally shredded, field "y" is never shredded
+    fn create_mixed_and_unshredded_variant() -> ArrayRef {
+        // Create spec-compliant mixed shredding:
+        // - Field "x" is globally shredded (has typed_value column)
+        // - Field "y" is never shredded (only appears in value field when 
present)
+
+        let (metadata, y_field_value) = {
+            let mut builder = parquet_variant::VariantBuilder::new();
+            let mut obj = builder.new_object();
+            obj.insert("y", Variant::from(42));
+            obj.finish();
+            builder.finish()
+        };
+
+        let metadata_array = 
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
+
+        // Value field contains objects with unshredded fields only (never 
contains "x")
+        // Row 0: {"y": "foo"} - x is shredded out, y remains in value
+        // Row 1: {} - both x and y are absent (perfect shredding for x, y 
missing)
+        // Row 2: {"y": null} - x is shredded out, y explicitly null
+        // Row 3: top-level null (encoded in VariantArray's null mask, but 
fields contain valid data)

Review Comment:
   on null rows, the array can contain any physically valid value; it will 
anyway be ignored



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to