Blizzara commented on code in PR #12245:
URL: https://github.com/apache/datafusion/pull/12245#discussion_r1745444635
##########
datafusion/substrait/src/logical_plan/consumer.rs:
##########
@@ -850,6 +864,79 @@ pub async fn from_substrait_rel(
}
}
+/// Ensures that the given Substrait schema is compatible with the schema as
given by DataFusion
+///
+/// This means:
+/// 1. All fields present in the Substrait schema are present in the
DataFusion schema. The
+/// DataFusion schema may have MORE fields, but not the other way around.
+/// 2. All fields are compatible. See [`ensure_field_compatability`] for
details
+///
+/// This function returns a DataFrame with fields adjusted if necessary in the
event that the
+/// Substrait schema is a subset of the DataFusion schema.
+fn ensure_schema_compatability(
+ table: DataFrame,
+ substrait_schema: DFSchema,
+) -> Result<DataFrame> {
+ // For now strip the qualifiers from the DataFusion and Substrait schemas
to make comparisons
+ // easier as there are issues around qualifier case.
+ // TODO: ensure that qualifiers are the same
+ let df_schema = table.schema().to_owned().strip_qualifiers();
+ if df_schema.logically_equivalent_names_and_types(&substrait_schema) {
+ return Ok(table);
+ }
+ let selected_columns = substrait_schema
+ .strip_qualifiers()
+ .fields()
+ .iter()
+ .map(|substrait_field| {
+ let df_field =
+ df_schema.field_with_unqualified_name(substrait_field.name())?;
+ ensure_field_compatability(df_field, substrait_field)?;
+ Ok(col(format!("\"{}\"", df_field.name())))
+ })
+ .collect::<Result<_>>()?;
+
+ table.select(selected_columns)
+}
+
+/// Ensures that the given Substrait field is compatible with the given
DataFusion field
+///
+/// A field is compatible between Substrait and DataFusion if:
+/// 1. They have logically equivalent types.
+/// 2. They have the same nullability OR the Substrait field is nullable and
the DataFusion fields
+/// is not nullable.
+///
+/// If a Substrait field is not nullable, the Substrait plan may be built
around assuming it is not
+/// nullable. As such if DataFusion has that field as not nullable the plan
should be rejected.
+fn ensure_field_compatability(
+ datafusion_field: &Field,
+ substrait_field: &Field,
+) -> Result<()> {
+ if DFSchema::datatype_is_logically_equal(
+ datafusion_field.data_type(),
+ substrait_field.data_type(),
+ ) {
+ if (datafusion_field.is_nullable() == substrait_field.is_nullable())
+ || (substrait_field.is_nullable() &&
!datafusion_field.is_nullable())
+ {
+ Ok(())
+ } else {
+ // TODO: from_substrait_struct_type needs to be updated to set the
nullability correctly. It defaults to true for now.
+ substrait_err!(
+ "Field '{}' is nullable in the Substrait schema but not
nullable in the DataFusion schema.",
Review Comment:
Isn't this the other way around? Nullable in DF but not nullable in Substrait
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]