adriangb commented on code in PR #19674:
URL: https://github.com/apache/datafusion/pull/19674#discussion_r2696771061


##########
datafusion/common/src/nested_struct.rs:
##########
@@ -165,6 +197,32 @@ pub fn cast_column(
     }
 }
 
+/// Cast a struct array to another struct type by aligning child arrays using
+/// field names instead of their physical order.
+///
+/// This is a convenience wrapper around the internal function 
`cast_struct_column` that accepts
+/// `Fields` directly instead of requiring a `Field` wrapper.
+///
+/// See [`cast_column`] for detailed documentation on the casting behavior.
+///
+/// # Arguments
+/// * `array` - The source array to cast (must be a struct array)
+/// * `target_fields` - The target struct field definitions
+/// * `cast_options` - Options controlling cast behavior (strictness, 
formatting)
+///
+/// # Returns
+/// A `Result<ArrayRef>` containing the cast struct array
+///
+/// # Errors
+/// Returns an error if the source is not a struct array or if field casting 
fails
+pub fn cast_struct_array_by_name(

Review Comment:
   Is this really necessary to add to the public API?



##########
datafusion/physical-expr/src/expressions/cast.rs:
##########
@@ -237,6 +237,11 @@ pub fn cast_with_options(
         Ok(Arc::clone(&expr))
     } else if can_cast_types(&expr_type, &cast_type) {
         Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options)))
+    } else if matches!((&expr_type, &cast_type), (Struct(_), Struct(_))) {
+        // Allow struct-to-struct casts even if Arrow's can_cast_types rejects 
them
+        // (e.g., field count mismatches). These will be handled by name-based 
casting
+        // at execution time via ColumnarValue::cast_to

Review Comment:
   Shouldn't we be applying the same rules here? It seems unfortunate that a 
lot of cases will succeed at planning time but fail at runtime



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -641,6 +641,20 @@ impl ConstEvaluator {
             Expr::ScalarFunction(ScalarFunction { func, .. }) => {
                 Self::volatility_ok(func.signature().volatility)
             }
+            Expr::Cast(Cast { expr, data_type })
+            | Expr::TryCast(TryCast { expr, data_type }) => {
+                if let (
+                    Ok(DataType::Struct(source_fields)),
+                    DataType::Struct(target_fields),
+                ) = (expr.get_type(&DFSchema::empty()), data_type)
+                {
+                    // Don't const-fold struct casts with different field 
counts
+                    if source_fields.len() != target_fields.len() {
+                        return false;
+                    }
+                }
+                true
+            }

Review Comment:
   Is this tested anywhere?



##########
datafusion/expr-common/src/type_coercion/binary.rs:
##########
@@ -1236,30 +1237,84 @@ fn coerce_numeric_type_to_decimal256(numeric_type: 
&DataType) -> Option<DataType
 
 fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> 
Option<DataType> {
     use arrow::datatypes::DataType::*;
+
     match (lhs_type, rhs_type) {
         (Struct(lhs_fields), Struct(rhs_fields)) => {
+            // Field count must match for coercion
             if lhs_fields.len() != rhs_fields.len() {
                 return None;
             }
 
-            let coerced_types = std::iter::zip(lhs_fields.iter(), 
rhs_fields.iter())
-                .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), 
rhs.data_type()))
-                .collect::<Option<Vec<DataType>>>()?;
-
-            // preserve the field name and nullability
-            let orig_fields = std::iter::zip(lhs_fields.iter(), 
rhs_fields.iter());
+            // If the two structs have exactly the same set of field names 
(possibly in
+            // different order), prefer name-based coercion. Otherwise fall 
back to
+            // positional coercion which preserves backward compatibility.
+            if fields_have_same_names(lhs_fields, rhs_fields) {
+                return coerce_struct_by_name(lhs_fields, rhs_fields);
+            }
 
-            let fields: Vec<FieldRef> = coerced_types
-                .into_iter()
-                .zip(orig_fields)
-                .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, 
rhs))
-                .collect();
-            Some(Struct(fields.into()))
+            coerce_struct_by_position(lhs_fields, rhs_fields)
         }
         _ => None,
     }
 }
 
+/// Return true if every left-field name exists in the right fields (and 
lengths are equal).
+fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool {
+    let rhs_names: HashSet<&str> = rhs_fields.iter().map(|f| 
f.name().as_str()).collect();

Review Comment:
   I think uniqueness of field names in structs is enforced elsewhere, but 
maybe we could add a comment here saying as to why we don't have to worry about 
`struct<c1 intt>` -> struct<c1 int, c1 int>`



##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -409,6 +409,9 @@ impl Optimizer {
                     }
                     // OptimizerRule was unsuccessful, but skipped failed 
rules is off, return error
                     (Err(e), None) => {
+                        if matches!(e, DataFusionError::Plan(_)) {
+                            return Err(e);
+                        }

Review Comment:
   This is a change for all optimizer rule failures right? Seems a bit broad to 
include in this PR. Maybe we could merge it as its own commit first?



##########
datafusion/common/src/nested_struct.rs:
##########
@@ -204,51 +262,105 @@ pub fn validate_struct_compatibility(
     source_fields: &[FieldRef],
     target_fields: &[FieldRef],
 ) -> Result<()> {
+    let has_overlap = fields_have_name_overlap(source_fields, target_fields);
+    if !has_overlap {
+        if source_fields.len() != target_fields.len() {
+            return _plan_err!(
+                "Cannot cast struct with {} fields to {} fields without name 
overlap; positional mapping is ambiguous",
+                source_fields.len(),
+                target_fields.len()
+            );
+        }
+
+        for (source_field, target_field) in 
source_fields.iter().zip(target_fields.iter())
+        {
+            validate_field_compatibility(source_field, target_field)?;
+        }
+
+        return Ok(());
+    }
+
     // Check compatibility for each target field
     for target_field in target_fields {
         // Look for matching field in source by name
         if let Some(source_field) = source_fields
             .iter()
             .find(|f| f.name() == target_field.name())
         {
-            // Ensure nullability is compatible. It is invalid to cast a 
nullable
-            // source field to a non-nullable target field as this may discard
-            // null values.
-            if source_field.is_nullable() && !target_field.is_nullable() {
+            validate_field_compatibility(source_field, target_field)?;
+        } else {
+            // Target field is missing from source
+            // If it's non-nullable, we cannot fill it with NULL
+            if !target_field.is_nullable() {
                 return _plan_err!(
-                    "Cannot cast nullable struct field '{}' to non-nullable 
field",
+                    "Cannot cast struct: target field '{}' is non-nullable but 
missing from source. \
+                     Cannot fill with NULL.",
                     target_field.name()
                 );
             }
-            // Check if the matching field types are compatible
-            match (source_field.data_type(), target_field.data_type()) {
-                // Recursively validate nested structs
-                (Struct(source_nested), Struct(target_nested)) => {
-                    validate_struct_compatibility(source_nested, 
target_nested)?;
-                }
-                // For non-struct types, use the existing castability check
-                _ => {
-                    if !arrow::compute::can_cast_types(
-                        source_field.data_type(),
-                        target_field.data_type(),
-                    ) {
-                        return _plan_err!(
-                            "Cannot cast struct field '{}' from type {} to 
type {}",
-                            target_field.name(),
-                            source_field.data_type(),
-                            target_field.data_type()
-                        );
-                    }
-                }
-            }
         }
-        // Missing fields in source are OK - they'll be filled with nulls
     }
 
     // Extra fields in source are OK - they'll be ignored
     Ok(())
 }
 
+fn validate_field_compatibility(
+    source_field: &Field,
+    target_field: &Field,
+) -> Result<()> {
+    if source_field.data_type() == &DataType::Null {
+        return Ok(());
+    }

Review Comment:
   I think the check below still needs to run. What if the target field is not 
nullable?



##########
datafusion/common/src/nested_struct.rs:
##########
@@ -54,16 +55,38 @@ fn cast_struct_column(
     target_fields: &[Arc<Field>],
     cast_options: &CastOptions,
 ) -> Result<ArrayRef> {
+    if source_col.data_type() == &DataType::Null
+        || (!source_col.is_empty() && source_col.null_count() == 
source_col.len())
+    {
+        return Ok(new_null_array(
+            &Struct(target_fields.to_vec().into()),
+            source_col.len(),
+        ));
+    }

Review Comment:
   This seems unnecessary if called from `cast_column`. Maybe document that 
this is needed if `cast_struct_column` gets called directly (if it ever is)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to