This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d61c2f4 [SPARK-37490][SQL] Show extra hint if analyzer fails due to ANSI type coercion d61c2f4 is described below commit d61c2f45c3c1fa90aef7f7aff0d9f292edfd3083 Author: Gengliang Wang <gengli...@apache.org> AuthorDate: Wed Dec 1 12:45:04 2021 +0800 [SPARK-37490][SQL] Show extra hint if analyzer fails due to ANSI type coercion ### What changes were proposed in this pull request? Show extra hint in the error message if analysis failed only with ANSI type coercion: ``` To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error. ``` ### Why are the changes needed? Improve error message ### Does this PR introduce _any_ user-facing change? Yes, Spark will show extra hint if analyzer fails due to ANSI type coercion ### How was this patch tested? Unit tests Closes #34747 from gengliangwang/improveCoercionMsg. Authored-by: Gengliang Wang <gengli...@apache.org> Signed-off-by: Gengliang Wang <gengli...@apache.org> --- .../sql/catalyst/analysis/AnsiTypeCoercion.scala | 7 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 111 +++++++++++++++++---- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 4 +- .../sql/catalyst/rules/RuleIdCollection.scala | 1 + .../resources/sql-tests/results/ansi/date.sql.out | 12 ++- .../sql-tests/results/ansi/interval.sql.out | 6 +- .../sql-tests/results/postgreSQL/union.sql.out | 1 + 7 files changed, 113 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala index debc13b..267c2cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala @@ -75,7 +75,7 @@ import org.apache.spark.sql.types._ object AnsiTypeCoercion extends TypeCoercionBase { override def typeCoercionRules: List[Rule[LogicalPlan]] = WidenSetOperationTypes :: - CombinedTypeCoercionRule( + new AnsiCombinedTypeCoercionRule( InConversion :: PromoteStringLiterals :: DecimalPrecision :: @@ -304,4 +304,9 @@ object AnsiTypeCoercion extends TypeCoercionBase { s.copy(left = newLeft, right = newRight) } } + + // This is for generating a new rule id, so that we can run both default and Ansi + // type coercion rules against one logical plan. + class AnsiCombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends + CombinedTypeCoercionRule(rules) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 5bf37a2..491d525 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils, TypeUtils} import org.apache.spark.sql.connector.catalog.{LookupCatalog, SupportsPartitionManagement} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} @@ -47,6 +48,8 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { */ val extendedCheckRules: Seq[LogicalPlan => Unit] = Nil + val DATA_TYPE_MISMATCH_ERROR = TreeNodeTag[Boolean]("dataTypeMismatchError") + protected def failAnalysis(msg: String): Nothing = { throw new AnalysisException(msg) } @@ -165,14 +168,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { } } - val exprs = operator match { - // `groupingExpressions` may rely on `aggregateExpressions`, due to the GROUP BY alias - // feature. We should check errors in `aggregateExpressions` first. - case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions - case _ => operator.expressions - } - - exprs.foreach(_.foreachUp { + getAllExpressions(operator).foreach(_.foreachUp { case a: Attribute if !a.resolved => val missingCol = a.sql val candidates = operator.inputSet.toSeq.map(_.qualifiedName) @@ -189,8 +185,10 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { case e: Expression if e.checkInputDataTypes().isFailure => e.checkInputDataTypes() match { case TypeCheckResult.TypeCheckFailure(message) => + e.setTagValue(DATA_TYPE_MISMATCH_ERROR, true) e.failAnalysis( - s"cannot resolve '${e.sql}' due to data type mismatch: $message") + s"cannot resolve '${e.sql}' due to data type mismatch: $message" + + extraHintForAnsiTypeCoercionExpression(operator)) } case c: Cast if !c.resolved => @@ -424,27 +422,20 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { |the ${ordinalNumber(ti + 1)} table has ${child.output.length} columns """.stripMargin.replace("\n", " ").trim()) } - val isUnion = operator.isInstanceOf[Union] - val dataTypesAreCompatibleFn = if (isUnion) { - (dt1: DataType, dt2: DataType) => - !DataType.equalsStructurally(dt1, dt2, true) - } else { - // SPARK-18058: we shall not care about the nullability of columns - (dt1: DataType, dt2: DataType) => - TypeCoercion.findWiderTypeForTwo(dt1.asNullable, dt2.asNullable).isEmpty - } + val dataTypesAreCompatibleFn = getDataTypesAreCompatibleFn(operator) // Check if the data types match. dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, dt2), ci) => // SPARK-18058: we shall not care about the nullability of columns if (dataTypesAreCompatibleFn(dt1, dt2)) { - failAnalysis( + val errorMessage = s""" |${operator.nodeName} can only be performed on tables with the compatible |column types. The ${ordinalNumber(ci)} column of the |${ordinalNumber(ti + 1)} table is ${dt1.catalogString} type which is not |compatible with ${dt2.catalogString} at same column of first table - """.stripMargin.replace("\n", " ").trim()) + """.stripMargin.replace("\n", " ").trim() + failAnalysis(errorMessage + extraHintForAnsiTypeCoercionPlan(operator)) } } } @@ -593,6 +584,86 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { plan.setAnalyzed() } + private def getAllExpressions(plan: LogicalPlan): Seq[Expression] = { + plan match { + // `groupingExpressions` may rely on `aggregateExpressions`, due to the GROUP BY alias + // feature. We should check errors in `aggregateExpressions` first. + case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions + case _ => plan.expressions + } + } + + private def getDataTypesAreCompatibleFn(plan: LogicalPlan): (DataType, DataType) => Boolean = { + val isUnion = plan.isInstanceOf[Union] + if (isUnion) { + (dt1: DataType, dt2: DataType) => + !DataType.equalsStructurally(dt1, dt2, true) + } else { + // SPARK-18058: we shall not care about the nullability of columns + (dt1: DataType, dt2: DataType) => + TypeCoercion.findWiderTypeForTwo(dt1.asNullable, dt2.asNullable).isEmpty + } + } + + private def getDefaultTypeCoercionPlan(plan: LogicalPlan): LogicalPlan = + TypeCoercion.typeCoercionRules.foldLeft(plan) { case (p, rule) => rule(p) } + + private def extraHintMessage(issueFixedIfAnsiOff: Boolean): String = { + if (issueFixedIfAnsiOff) { + "\nTo fix the error, you might need to add explicit type casts. If necessary set " + + s"${SQLConf.ANSI_ENABLED.key} to false to bypass this error." + } else { + "" + } + } + + private def extraHintForAnsiTypeCoercionExpression(plan: LogicalPlan): String = { + if (!SQLConf.get.ansiEnabled) { + "" + } else { + val nonAnsiPlan = getDefaultTypeCoercionPlan(plan) + var issueFixedIfAnsiOff = true + getAllExpressions(nonAnsiPlan).foreach(_.foreachUp { + case e: Expression if e.getTagValue(DATA_TYPE_MISMATCH_ERROR).contains(true) && + e.checkInputDataTypes().isFailure => + e.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckFailure(_) => + issueFixedIfAnsiOff = false + } + + case _ => + }) + extraHintMessage(issueFixedIfAnsiOff) + } + } + + private def extraHintForAnsiTypeCoercionPlan(plan: LogicalPlan): String = { + if (!SQLConf.get.ansiEnabled) { + "" + } else { + val nonAnsiPlan = getDefaultTypeCoercionPlan(plan) + var issueFixedIfAnsiOff = true + nonAnsiPlan match { + case _: Union | _: SetOperation if nonAnsiPlan.children.length > 1 => + def dataTypes(plan: LogicalPlan): Seq[DataType] = plan.output.map(_.dataType) + + val ref = dataTypes(nonAnsiPlan.children.head) + val dataTypesAreCompatibleFn = getDataTypesAreCompatibleFn(nonAnsiPlan) + nonAnsiPlan.children.tail.zipWithIndex.foreach { case (child, ti) => + // Check if the data types match. + dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, dt2), ci) => + if (dataTypesAreCompatibleFn(dt1, dt2)) { + issueFixedIfAnsiOff = false + } + } + } + + case _ => + } + extraHintMessage(issueFixedIfAnsiOff) + } + } + /** * Validates subquery expressions in the plan. Upon failure, returns an user facing error. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 5066674..82fba93 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -170,7 +170,7 @@ abstract class TypeCoercionBase { * Type coercion rule that combines multiple type coercion rules and applies them in a single tree * traversal. */ - case class CombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends TypeCoercionRule { + class CombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends TypeCoercionRule { override def transform: PartialFunction[Expression, Expression] = { val transforms = rules.map(_.transform) Function.unlift { e: Expression => @@ -795,7 +795,7 @@ object TypeCoercion extends TypeCoercionBase { override def typeCoercionRules: List[Rule[LogicalPlan]] = WidenSetOperationTypes :: - CombinedTypeCoercionRule( + new CombinedTypeCoercionRule( InConversion :: PromoteStrings :: DecimalPrecision :: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index 5ec303d..4face49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -76,6 +76,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder" :: "org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution" :: + "org.apache.spark.sql.catalyst.analysis.AnsiTypeCoercion$AnsiCombinedTypeCoercionRule" :: "org.apache.spark.sql.catalyst.analysis.ApplyCharTypePadding" :: "org.apache.spark.sql.catalyst.analysis.DeduplicateRelations" :: "org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases" :: diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out index b95c8da..c3c0977 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out @@ -230,7 +230,8 @@ select next_day(timestamp_ntz"2015-07-23 12:12:12", "Mon") struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'next_day(TIMESTAMP_NTZ '2015-07-23 12:12:12', 'Mon')' due to data type mismatch: argument 1 requires date type, however, 'TIMESTAMP_NTZ '2015-07-23 12:12:12'' is of timestamp_ntz type.; line 1 pos 7 +cannot resolve 'next_day(TIMESTAMP_NTZ '2015-07-23 12:12:12', 'Mon')' due to data type mismatch: argument 1 requires date type, however, 'TIMESTAMP_NTZ '2015-07-23 12:12:12'' is of timestamp_ntz type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query @@ -498,7 +499,8 @@ select date_add(date_str, 1) from date_view struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_add(date_view.date_str, 1)' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type.; line 1 pos 7 +cannot resolve 'date_add(date_view.date_str, 1)' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query @@ -507,7 +509,8 @@ select date_sub(date_str, 1) from date_view struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_sub(date_view.date_str, 1)' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type.; line 1 pos 7 +cannot resolve 'date_sub(date_view.date_str, 1)' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query @@ -589,7 +592,8 @@ select date_str - date '2001-09-28' from date_view struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(date_view.date_str - DATE '2001-09-28')' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type.; line 1 pos 7 +cannot resolve '(date_view.date_str - DATE '2001-09-28')' due to data type mismatch: argument 1 requires date type, however, 'date_view.date_str' is of string type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index e9c3232..230393f 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1533,7 +1533,8 @@ select str - interval '4 22:12' day to minute from interval_view struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'interval_view.str + (- INTERVAL '4 22:12' DAY TO MINUTE)' due to data type mismatch: argument 1 requires (timestamp or timestamp without time zone) type, however, 'interval_view.str' is of string type.; line 1 pos 7 +cannot resolve 'interval_view.str + (- INTERVAL '4 22:12' DAY TO MINUTE)' due to data type mismatch: argument 1 requires (timestamp or timestamp without time zone) type, however, 'interval_view.str' is of string type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query @@ -1542,7 +1543,8 @@ select str + interval '4 22:12' day to minute from interval_view struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'interval_view.str + INTERVAL '4 22:12' DAY TO MINUTE' due to data type mismatch: argument 1 requires (timestamp or timestamp without time zone) type, however, 'interval_view.str' is of string type.; line 1 pos 7 +cannot resolve 'interval_view.str + INTERVAL '4 22:12' DAY TO MINUTE' due to data type mismatch: argument 1 requires (timestamp or timestamp without time zone) type, however, 'interval_view.str' is of string type. +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out index 13f3fe0..84dcf3a 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out @@ -686,6 +686,7 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. The first column of the second table is string type which is not compatible with decimal(38,18) at same column of first table +To fix the error, you might need to add explicit type casts. If necessary set spark.sql.ansi.enabled to false to bypass this error. -- !query --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org