adriangb commented on code in PR #19674:
URL: https://github.com/apache/datafusion/pull/19674#discussion_r2706697110


##########
datafusion/common/src/nested_struct.rs:
##########
@@ -31,6 +31,7 @@ use std::sync::Arc;
 ///
 /// ## Field Matching Strategy
 /// - **By Name**: Source struct fields are matched to target fields by name 
(case-sensitive)
+/// - **By Position**: When there is no name overlap and the field counts 
match, fields are cast by index

Review Comment:
   I think it's worth clarifying we're going to remove this behavior if we 
indeed plan on doing so. Basically why not deprecate it as best we can in this 
PR.



##########
datafusion/common/src/nested_struct.rs:
##########
@@ -54,16 +55,38 @@ fn cast_struct_column(
     target_fields: &[Arc<Field>],
     cast_options: &CastOptions,
 ) -> Result<ArrayRef> {
+    if source_col.data_type() == &DataType::Null
+        || (!source_col.is_empty() && source_col.null_count() == 
source_col.len())
+    {
+        return Ok(new_null_array(
+            &Struct(target_fields.to_vec().into()),
+            source_col.len(),
+        ));
+    }
+
     if let Some(source_struct) = 
source_col.as_any().downcast_ref::<StructArray>() {
-        validate_struct_compatibility(source_struct.fields(), target_fields)?;
+        let source_fields = source_struct.fields();
+        let has_overlap = fields_have_name_overlap(source_fields, 
target_fields);
+        validate_struct_compatibility(source_fields, target_fields)?;

Review Comment:
   Does this function need to be adjusted at all to take mapping of fields by 
name into account?



##########
datafusion/sqllogictest/test_files/struct.slt:
##########
@@ -590,11 +566,6 @@ select [{r: 'a', c: 1}, {r: 'b', c: 2}];
 ----
 [{r: a, c: 1}, {r: b, c: 2}]
 
-# Create a list of struct with different field types
-query ?
-select [{r: 'a', c: 1}, {c: 2, r: 'b'}];

Review Comment:
   I also don't see a new test with array literals (searched for `select [{`)



##########
datafusion/sqllogictest/test_files/struct.slt:
##########
@@ -492,8 +492,7 @@ Struct("r": Utf8, "c": Float64)
 statement ok
 drop table t;
 
-query error DataFusion error: Optimizer rule 'simplify_expressions' 
failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of 
Float64 type
-create table t as values({r: 'a', c: 1}), ({c: 2.3, r: 'b'});

Review Comment:
   I don't see a new test that checks implicit coercion during `create table 
... as values`. Am I missing something? I think it should be added back if not.



##########
datafusion/sqllogictest/test_files/struct.slt:
##########
@@ -607,18 +578,6 @@ List(Struct("r": Utf8View, "c": Float32))
 statement ok
 drop table t;
 
-# create table with different struct type is fine
-statement ok
-create table t(a struct(r varchar, c int), b struct(c float, r varchar)) as 
values (row('a', 1), row(2.3, 'b'));
-
-# create array with different struct type should be cast
-query T
-select arrow_typeof([a, b]) from t;

Review Comment:
   I don't actually see a test for the target type / casting of a dynamically 
created array (searched for `[a` and `arrow_typeof([`)



##########
datafusion/common/src/nested_struct.rs:
##########
@@ -204,51 +227,114 @@ pub fn validate_struct_compatibility(
     source_fields: &[FieldRef],
     target_fields: &[FieldRef],
 ) -> Result<()> {
+    let has_overlap = fields_have_name_overlap(source_fields, target_fields);
+    if !has_overlap {
+        if source_fields.len() != target_fields.len() {
+            return _plan_err!(
+                "Cannot cast struct with {} fields to {} fields without name 
overlap; positional mapping is ambiguous",
+                source_fields.len(),
+                target_fields.len()
+            );
+        }
+
+        for (source_field, target_field) in 
source_fields.iter().zip(target_fields.iter())
+        {
+            validate_field_compatibility(source_field, target_field)?;
+        }
+
+        return Ok(());
+    }
+
     // Check compatibility for each target field
     for target_field in target_fields {
         // Look for matching field in source by name
         if let Some(source_field) = source_fields
             .iter()
             .find(|f| f.name() == target_field.name())
         {
-            // Ensure nullability is compatible. It is invalid to cast a 
nullable
-            // source field to a non-nullable target field as this may discard
-            // null values.
-            if source_field.is_nullable() && !target_field.is_nullable() {
+            validate_field_compatibility(source_field, target_field)?;
+        } else {
+            // Target field is missing from source
+            // If it's non-nullable, we cannot fill it with NULL
+            if !target_field.is_nullable() {
                 return _plan_err!(
-                    "Cannot cast nullable struct field '{}' to non-nullable 
field",
+                    "Cannot cast struct: target field '{}' is non-nullable but 
missing from source. \
+                     Cannot fill with NULL.",
                     target_field.name()
                 );
             }
-            // Check if the matching field types are compatible
-            match (source_field.data_type(), target_field.data_type()) {
-                // Recursively validate nested structs
-                (Struct(source_nested), Struct(target_nested)) => {
-                    validate_struct_compatibility(source_nested, 
target_nested)?;
-                }
-                // For non-struct types, use the existing castability check
-                _ => {
-                    if !arrow::compute::can_cast_types(
-                        source_field.data_type(),
-                        target_field.data_type(),
-                    ) {
-                        return _plan_err!(
-                            "Cannot cast struct field '{}' from type {} to 
type {}",
-                            target_field.name(),
-                            source_field.data_type(),
-                            target_field.data_type()
-                        );
-                    }
-                }
-            }
         }
-        // Missing fields in source are OK - they'll be filled with nulls
     }
 
     // Extra fields in source are OK - they'll be ignored
     Ok(())
 }
 
+fn validate_field_compatibility(
+    source_field: &Field,
+    target_field: &Field,
+) -> Result<()> {
+    if source_field.data_type() == &DataType::Null {
+        // Validate that target allows nulls before returning early.
+        // It is invalid to cast a NULL source field to a non-nullable target 
field.
+        if !target_field.is_nullable() {
+            return _plan_err!(
+                "Cannot cast NULL struct field '{}' to non-nullable field 
'{}'",
+                source_field.name(),
+                target_field.name()
+            );
+        }
+        return Ok(());
+    }
+
+    // Ensure nullability is compatible. It is invalid to cast a nullable
+    // source field to a non-nullable target field as this may discard
+    // null values.
+    if source_field.is_nullable() && !target_field.is_nullable() {
+        return _plan_err!(
+            "Cannot cast nullable struct field '{}' to non-nullable field",
+            target_field.name()
+        );
+    }
+
+    // Check if the matching field types are compatible
+    match (source_field.data_type(), target_field.data_type()) {
+        // Recursively validate nested structs
+        (Struct(source_nested), Struct(target_nested)) => {
+            validate_struct_compatibility(source_nested, target_nested)?;
+        }
+        // For non-struct types, use the existing castability check
+        _ => {
+            if !arrow::compute::can_cast_types(
+                source_field.data_type(),
+                target_field.data_type(),
+            ) {
+                return _plan_err!(
+                    "Cannot cast struct field '{}' from type {} to type {}",
+                    target_field.name(),
+                    source_field.data_type(),
+                    target_field.data_type()
+                );
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn fields_have_name_overlap(

Review Comment:
   I'd rename this to `has_one_or_more_common_fields` or something like that to 
make it clear the behavior in the name. Alternatively add a docstring.



##########
datafusion/sqllogictest/test_files/struct.slt:
##########
@@ -492,8 +492,7 @@ Struct("r": Utf8, "c": Float64)
 statement ok
 drop table t;
 
-query error DataFusion error: Optimizer rule 'simplify_expressions' 
failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of 
Float64 type
-create table t as values({r: 'a', c: 1}), ({c: 2.3, r: 'b'});

Review Comment:
   And big picture the old tests check implicit coercion by creating arrays or 
having mismatched types in a `values` clause. The new tests always use an 
explicit `cast` call.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to