adriangb commented on code in PR #19674:
URL: https://github.com/apache/datafusion/pull/19674#discussion_r2696771061
##########
datafusion/common/src/nested_struct.rs:
##########
@@ -165,6 +197,32 @@ pub fn cast_column(
}
}
+/// Cast a struct array to another struct type by aligning child arrays using
+/// field names instead of their physical order.
+///
+/// This is a convenience wrapper around the internal function
`cast_struct_column` that accepts
+/// `Fields` directly instead of requiring a `Field` wrapper.
+///
+/// See [`cast_column`] for detailed documentation on the casting behavior.
+///
+/// # Arguments
+/// * `array` - The source array to cast (must be a struct array)
+/// * `target_fields` - The target struct field definitions
+/// * `cast_options` - Options controlling cast behavior (strictness,
formatting)
+///
+/// # Returns
+/// A `Result<ArrayRef>` containing the cast struct array
+///
+/// # Errors
+/// Returns an error if the source is not a struct array or if field casting
fails
+pub fn cast_struct_array_by_name(
Review Comment:
Is this really necessary to add to the public API?
##########
datafusion/physical-expr/src/expressions/cast.rs:
##########
@@ -237,6 +237,11 @@ pub fn cast_with_options(
Ok(Arc::clone(&expr))
} else if can_cast_types(&expr_type, &cast_type) {
Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options)))
+ } else if matches!((&expr_type, &cast_type), (Struct(_), Struct(_))) {
+ // Allow struct-to-struct casts even if Arrow's can_cast_types rejects
them
+ // (e.g., field count mismatches). These will be handled by name-based
casting
+ // at execution time via ColumnarValue::cast_to
Review Comment:
Shouldn't we be applying the same rules here? It seems unfortunate that a
lot of cases will succeed at planning time but fail at runtime
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -641,6 +641,20 @@ impl ConstEvaluator {
Expr::ScalarFunction(ScalarFunction { func, .. }) => {
Self::volatility_ok(func.signature().volatility)
}
+ Expr::Cast(Cast { expr, data_type })
+ | Expr::TryCast(TryCast { expr, data_type }) => {
+ if let (
+ Ok(DataType::Struct(source_fields)),
+ DataType::Struct(target_fields),
+ ) = (expr.get_type(&DFSchema::empty()), data_type)
+ {
+ // Don't const-fold struct casts with different field
counts
+ if source_fields.len() != target_fields.len() {
+ return false;
+ }
+ }
+ true
+ }
Review Comment:
Is this tested anywhere?
##########
datafusion/expr-common/src/type_coercion/binary.rs:
##########
@@ -1236,30 +1237,84 @@ fn coerce_numeric_type_to_decimal256(numeric_type:
&DataType) -> Option<DataType
fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
use arrow::datatypes::DataType::*;
+
match (lhs_type, rhs_type) {
(Struct(lhs_fields), Struct(rhs_fields)) => {
+ // Field count must match for coercion
if lhs_fields.len() != rhs_fields.len() {
return None;
}
- let coerced_types = std::iter::zip(lhs_fields.iter(),
rhs_fields.iter())
- .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(),
rhs.data_type()))
- .collect::<Option<Vec<DataType>>>()?;
-
- // preserve the field name and nullability
- let orig_fields = std::iter::zip(lhs_fields.iter(),
rhs_fields.iter());
+ // If the two structs have exactly the same set of field names
(possibly in
+ // different order), prefer name-based coercion. Otherwise fall
back to
+ // positional coercion which preserves backward compatibility.
+ if fields_have_same_names(lhs_fields, rhs_fields) {
+ return coerce_struct_by_name(lhs_fields, rhs_fields);
+ }
- let fields: Vec<FieldRef> = coerced_types
- .into_iter()
- .zip(orig_fields)
- .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs,
rhs))
- .collect();
- Some(Struct(fields.into()))
+ coerce_struct_by_position(lhs_fields, rhs_fields)
}
_ => None,
}
}
+/// Return true if every left-field name exists in the right fields (and
lengths are equal).
+fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool {
+ let rhs_names: HashSet<&str> = rhs_fields.iter().map(|f|
f.name().as_str()).collect();
Review Comment:
I think uniqueness of field names in structs is enforced elsewhere, but
maybe we could add a comment here saying as to why we don't have to worry about
`struct<c1 intt>` -> struct<c1 int, c1 int>`
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -409,6 +409,9 @@ impl Optimizer {
}
// OptimizerRule was unsuccessful, but skipped failed
rules is off, return error
(Err(e), None) => {
+ if matches!(e, DataFusionError::Plan(_)) {
+ return Err(e);
+ }
Review Comment:
This is a change for all optimizer rule failures right? Seems a bit broad to
include in this PR. Maybe we could merge it as its own commit first?
##########
datafusion/common/src/nested_struct.rs:
##########
@@ -204,51 +262,105 @@ pub fn validate_struct_compatibility(
source_fields: &[FieldRef],
target_fields: &[FieldRef],
) -> Result<()> {
+ let has_overlap = fields_have_name_overlap(source_fields, target_fields);
+ if !has_overlap {
+ if source_fields.len() != target_fields.len() {
+ return _plan_err!(
+ "Cannot cast struct with {} fields to {} fields without name
overlap; positional mapping is ambiguous",
+ source_fields.len(),
+ target_fields.len()
+ );
+ }
+
+ for (source_field, target_field) in
source_fields.iter().zip(target_fields.iter())
+ {
+ validate_field_compatibility(source_field, target_field)?;
+ }
+
+ return Ok(());
+ }
+
// Check compatibility for each target field
for target_field in target_fields {
// Look for matching field in source by name
if let Some(source_field) = source_fields
.iter()
.find(|f| f.name() == target_field.name())
{
- // Ensure nullability is compatible. It is invalid to cast a
nullable
- // source field to a non-nullable target field as this may discard
- // null values.
- if source_field.is_nullable() && !target_field.is_nullable() {
+ validate_field_compatibility(source_field, target_field)?;
+ } else {
+ // Target field is missing from source
+ // If it's non-nullable, we cannot fill it with NULL
+ if !target_field.is_nullable() {
return _plan_err!(
- "Cannot cast nullable struct field '{}' to non-nullable
field",
+ "Cannot cast struct: target field '{}' is non-nullable but
missing from source. \
+ Cannot fill with NULL.",
target_field.name()
);
}
- // Check if the matching field types are compatible
- match (source_field.data_type(), target_field.data_type()) {
- // Recursively validate nested structs
- (Struct(source_nested), Struct(target_nested)) => {
- validate_struct_compatibility(source_nested,
target_nested)?;
- }
- // For non-struct types, use the existing castability check
- _ => {
- if !arrow::compute::can_cast_types(
- source_field.data_type(),
- target_field.data_type(),
- ) {
- return _plan_err!(
- "Cannot cast struct field '{}' from type {} to
type {}",
- target_field.name(),
- source_field.data_type(),
- target_field.data_type()
- );
- }
- }
- }
}
- // Missing fields in source are OK - they'll be filled with nulls
}
// Extra fields in source are OK - they'll be ignored
Ok(())
}
+fn validate_field_compatibility(
+ source_field: &Field,
+ target_field: &Field,
+) -> Result<()> {
+ if source_field.data_type() == &DataType::Null {
+ return Ok(());
+ }
Review Comment:
I think the check below still needs to run. What if the target field is not
nullable?
##########
datafusion/common/src/nested_struct.rs:
##########
@@ -54,16 +55,38 @@ fn cast_struct_column(
target_fields: &[Arc<Field>],
cast_options: &CastOptions,
) -> Result<ArrayRef> {
+ if source_col.data_type() == &DataType::Null
+ || (!source_col.is_empty() && source_col.null_count() ==
source_col.len())
+ {
+ return Ok(new_null_array(
+ &Struct(target_fields.to_vec().into()),
+ source_col.len(),
+ ));
+ }
Review Comment:
This seems unnecessary if called from `cast_column`. Maybe document that
this is needed if `cast_struct_column` gets called directly (if it ever is)?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]