[spark] branch master updated: [SPARK-37490][SQL] Show extra hint if analyzer fails due to ANSI type coercion

gengliang Tue, 30 Nov 2021 20:46:31 -0800

This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new d61c2f4  [SPARK-37490][SQL] Show extra hint if analyzer fails due to 
ANSI type coercion
d61c2f4 is described below

commit d61c2f45c3c1fa90aef7f7aff0d9f292edfd3083
Author: Gengliang Wang <gengli...@apache.org>
AuthorDate: Wed Dec 1 12:45:04 2021 +0800

    [SPARK-37490][SQL] Show extra hint if analyzer fails due to ANSI type 
coercion
    
    ### What changes were proposed in this pull request?
    
    Show an extra hint in the error message if analysis fails under ANSI type 
coercion but would succeed under the default type coercion rules:
    ```
    To fix the error, you might need to add explicit type casts. If necessary 
set spark.sql.ansi.enabled to false to bypass this error.
    ```
    ### Why are the changes needed?
    
    Improve the error message so that users know how to resolve or bypass analysis failures caused by ANSI type coercion.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, Spark will show an extra hint if the analyzer fails due to ANSI type coercion.
    
    ### How was this patch tested?
    
    Unit tests
    
    Closes #34747 from gengliangwang/improveCoercionMsg.
    
    Authored-by: Gengliang Wang <gengli...@apache.org>
    Signed-off-by: Gengliang Wang <gengli...@apache.org>
---
 .../sql/catalyst/analysis/AnsiTypeCoercion.scala   |   7 +-
 .../sql/catalyst/analysis/CheckAnalysis.scala      | 111 +++++++++++++++++----
 .../spark/sql/catalyst/analysis/TypeCoercion.scala |   4 +-
 .../sql/catalyst/rules/RuleIdCollection.scala      |   1 +
 .../resources/sql-tests/results/ansi/date.sql.out  |  12 ++-
 .../sql-tests/results/ansi/interval.sql.out        |   6 +-
 .../sql-tests/results/postgreSQL/union.sql.out     |   1 +
 7 files changed, 113 insertions(+), 29 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala
index debc13b..267c2cc 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala
@@ -75,7 +75,7 @@ import org.apache.spark.sql.types._
 object AnsiTypeCoercion extends TypeCoercionBase {
   override def typeCoercionRules: List[Rule[LogicalPlan]] =
     WidenSetOperationTypes ::
-    CombinedTypeCoercionRule(
+    new AnsiCombinedTypeCoercionRule(
       InConversion ::
       PromoteStringLiterals ::
       DecimalPrecision ::
@@ -304,4 +304,9 @@ object AnsiTypeCoercion extends TypeCoercionBase {
         s.copy(left = newLeft, right = newRight)
     }
   }
+
+  // This is for generating a new rule id, so that we can run both default and 
Ansi
+  // type coercion rules against one logical plan.
+  class AnsiCombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends
+    CombinedTypeCoercionRule(rules)
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 5bf37a2..491d525 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -25,6 +25,7 @@ import 
org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils, 
TypeUtils}
 import org.apache.spark.sql.connector.catalog.{LookupCatalog, 
SupportsPartitionManagement}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, 
QueryExecutionErrors}
@@ -47,6 +48,8 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
    */
   val extendedCheckRules: Seq[LogicalPlan => Unit] = Nil
 
+  val DATA_TYPE_MISMATCH_ERROR = TreeNodeTag[Boolean]("dataTypeMismatchError")
+
   protected def failAnalysis(msg: String): Nothing = {
     throw new AnalysisException(msg)
   }
@@ -165,14 +168,7 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
             }
         }
 
-        val exprs = operator match {
-          // `groupingExpressions` may rely on `aggregateExpressions`, due to 
the GROUP BY alias
-          // feature. We should check errors in `aggregateExpressions` first.
-          case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions
-          case _ => operator.expressions
-        }
-
-        exprs.foreach(_.foreachUp {
+        getAllExpressions(operator).foreach(_.foreachUp {
           case a: Attribute if !a.resolved =>
             val missingCol = a.sql
             val candidates = operator.inputSet.toSeq.map(_.qualifiedName)
@@ -189,8 +185,10 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
           case e: Expression if e.checkInputDataTypes().isFailure =>
             e.checkInputDataTypes() match {
               case TypeCheckResult.TypeCheckFailure(message) =>
+                e.setTagValue(DATA_TYPE_MISMATCH_ERROR, true)
                 e.failAnalysis(
-                  s"cannot resolve '${e.sql}' due to data type mismatch: 
$message")
+                  s"cannot resolve '${e.sql}' due to data type mismatch: 
$message" +
+                    extraHintForAnsiTypeCoercionExpression(operator))
             }
 
           case c: Cast if !c.resolved =>
@@ -424,27 +422,20 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
                     |the ${ordinalNumber(ti + 1)} table has 
${child.output.length} columns
                   """.stripMargin.replace("\n", " ").trim())
               }
-              val isUnion = operator.isInstanceOf[Union]
-              val dataTypesAreCompatibleFn = if (isUnion) {
-                (dt1: DataType, dt2: DataType) =>
-                  !DataType.equalsStructurally(dt1, dt2, true)
-              } else {
-                // SPARK-18058: we shall not care about the nullability of 
columns
-                (dt1: DataType, dt2: DataType) =>
-                  TypeCoercion.findWiderTypeForTwo(dt1.asNullable, 
dt2.asNullable).isEmpty
-              }
 
+              val dataTypesAreCompatibleFn = 
getDataTypesAreCompatibleFn(operator)
               // Check if the data types match.
               dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, 
dt2), ci) =>
                 // SPARK-18058: we shall not care about the nullability of 
columns
                 if (dataTypesAreCompatibleFn(dt1, dt2)) {
-                  failAnalysis(
+                  val errorMessage =
                     s"""
                        |${operator.nodeName} can only be performed on tables 
with the compatible
                        |column types. The ${ordinalNumber(ci)} column of the
                        |${ordinalNumber(ti + 1)} table is ${dt1.catalogString} 
type which is not
                        |compatible with ${dt2.catalogString} at same column of 
first table
-                    """.stripMargin.replace("\n", " ").trim())
+                    """.stripMargin.replace("\n", " ").trim()
+                  failAnalysis(errorMessage + 
extraHintForAnsiTypeCoercionPlan(operator))
                 }
               }
             }
@@ -593,6 +584,86 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
     plan.setAnalyzed()
   }
 
+  private def getAllExpressions(plan: LogicalPlan): Seq[Expression] = {
+    plan match {
+      // `groupingExpressions` may rely on `aggregateExpressions`, due to the 
GROUP BY alias
+      // feature. We should check errors in `aggregateExpressions` first.
+      case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions
+      case _ => plan.expressions
+    }
+  }
+
+  private def getDataTypesAreCompatibleFn(plan: LogicalPlan): (DataType, 
DataType) => Boolean = {
+    val isUnion = plan.isInstanceOf[Union]
+    if (isUnion) {
+      (dt1: DataType, dt2: DataType) =>
+        !DataType.equalsStructurally(dt1, dt2, true)
+    } else {
+      // SPARK-18058: we shall not care about the nullability of columns
+      (dt1: DataType, dt2: DataType) =>
+        TypeCoercion.findWiderTypeForTwo(dt1.asNullable, 
dt2.asNullable).isEmpty
+    }
+  }
+
+  private def getDefaultTypeCoercionPlan(plan: LogicalPlan): LogicalPlan =
+    TypeCoercion.typeCoercionRules.foldLeft(plan) { case (p, rule) => rule(p) }
+
+  private def extraHintMessage(issueFixedIfAnsiOff: Boolean): String = {
+    if (issueFixedIfAnsiOff) {
+      "\nTo fix the error, you might need to add explicit type casts. If 
necessary set " +
+        s"${SQLConf.ANSI_ENABLED.key} to false to bypass this error."
+    } else {
+      ""
+    }
+  }
+
+  private def extraHintForAnsiTypeCoercionExpression(plan: LogicalPlan): 
String = {
+    if (!SQLConf.get.ansiEnabled) {
+      ""
+    } else {
+      val nonAnsiPlan = getDefaultTypeCoercionPlan(plan)
+      var issueFixedIfAnsiOff = true
+      getAllExpressions(nonAnsiPlan).foreach(_.foreachUp {
+        case e: Expression if 
e.getTagValue(DATA_TYPE_MISMATCH_ERROR).contains(true) &&
+            e.checkInputDataTypes().isFailure =>
+          e.checkInputDataTypes() match {
+            case TypeCheckResult.TypeCheckFailure(_) =>
+              issueFixedIfAnsiOff = false
+          }
+
+        case _ =>
+      })
+      extraHintMessage(issueFixedIfAnsiOff)
+    }
+  }
+
+  private def extraHintForAnsiTypeCoercionPlan(plan: LogicalPlan): String = {
+    if (!SQLConf.get.ansiEnabled) {
+      ""
+    } else {
+      val nonAnsiPlan = getDefaultTypeCoercionPlan(plan)
+      var issueFixedIfAnsiOff = true
+      nonAnsiPlan match {
+        case _: Union | _: SetOperation if nonAnsiPlan.children.length > 1 =>
+          def dataTypes(plan: LogicalPlan): Seq[DataType] = 
plan.output.map(_.dataType)
+
+          val ref = dataTypes(nonAnsiPlan.children.head)
+          val dataTypesAreCompatibleFn = 
getDataTypesAreCompatibleFn(nonAnsiPlan)
+          nonAnsiPlan.children.tail.zipWithIndex.foreach { case (child, ti) =>
+            // Check if the data types match.
+            dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, dt2), 
ci) =>
+              if (dataTypesAreCompatibleFn(dt1, dt2)) {
+                issueFixedIfAnsiOff = false
+              }
+            }
+          }
+
+        case _ =>
+      }
+      extraHintMessage(issueFixedIfAnsiOff)
+    }
+  }
+
   /**
    * Validates subquery expressions in the plan. Upon failure, returns an user 
facing error.
    */
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
index 5066674..82fba93 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
@@ -170,7 +170,7 @@ abstract class TypeCoercionBase {
    * Type coercion rule that combines multiple type coercion rules and applies 
them in a single tree
    * traversal.
    */
-  case class CombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends 
TypeCoercionRule {
+  class CombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends 
TypeCoercionRule {
     override def transform: PartialFunction[Expression, Expression] = {
       val transforms = rules.map(_.transform)
       Function.unlift { e: Expression =>
@@ -795,7 +795,7 @@ object TypeCoercion extends TypeCoercionBase {
 
   override def typeCoercionRules: List[Rule[LogicalPlan]] =
     WidenSetOperationTypes ::
-    CombinedTypeCoercionRule(
+    new CombinedTypeCoercionRule(
       InConversion ::
       PromoteStrings ::
       DecimalPrecision ::
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
index 5ec303d..4face49 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
@@ -76,6 +76,7 @@ object RuleIdCollection {
       "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame" ::
       "org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder" ::
       "org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution" ::
+      
"org.apache.spark.sql.catalyst.analysis.AnsiTypeCoercion$AnsiCombinedTypeCoercionRule"
 ::
       "org.apache.spark.sql.catalyst.analysis.ApplyCharTypePadding" ::
       "org.apache.spark.sql.catalyst.analysis.DeduplicateRelations" ::
       "org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases" ::
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out
index b95c8da..c3c0977 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/date.sql.out
@@ -230,7 +230,8 @@ select next_day(timestamp_ntz"2015-07-23 12:12:12", "Mon")
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'next_day(TIMESTAMP_NTZ '2015-07-23 12:12:12', 'Mon')' due to 
data type mismatch: argument 1 requires date type, however, 'TIMESTAMP_NTZ 
'2015-07-23 12:12:12'' is of timestamp_ntz type.; line 1 pos 7
+cannot resolve 'next_day(TIMESTAMP_NTZ '2015-07-23 12:12:12', 'Mon')' due to 
data type mismatch: argument 1 requires date type, however, 'TIMESTAMP_NTZ 
'2015-07-23 12:12:12'' is of timestamp_ntz type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
@@ -498,7 +499,8 @@ select date_add(date_str, 1) from date_view
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'date_add(date_view.date_str, 1)' due to data type mismatch: 
argument 1 requires date type, however, 'date_view.date_str' is of string 
type.; line 1 pos 7
+cannot resolve 'date_add(date_view.date_str, 1)' due to data type mismatch: 
argument 1 requires date type, however, 'date_view.date_str' is of string type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
@@ -507,7 +509,8 @@ select date_sub(date_str, 1) from date_view
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'date_sub(date_view.date_str, 1)' due to data type mismatch: 
argument 1 requires date type, however, 'date_view.date_str' is of string 
type.; line 1 pos 7
+cannot resolve 'date_sub(date_view.date_str, 1)' due to data type mismatch: 
argument 1 requires date type, however, 'date_view.date_str' is of string type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
@@ -589,7 +592,8 @@ select date_str - date '2001-09-28' from date_view
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve '(date_view.date_str - DATE '2001-09-28')' due to data type 
mismatch: argument 1 requires date type, however, 'date_view.date_str' is of 
string type.; line 1 pos 7
+cannot resolve '(date_view.date_str - DATE '2001-09-28')' due to data type 
mismatch: argument 1 requires date type, however, 'date_view.date_str' is of 
string type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
index e9c3232..230393f 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
@@ -1533,7 +1533,8 @@ select str - interval '4 22:12' day to minute from 
interval_view
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'interval_view.str + (- INTERVAL '4 22:12' DAY TO MINUTE)' due 
to data type mismatch: argument 1 requires (timestamp or timestamp without time 
zone) type, however, 'interval_view.str' is of string type.; line 1 pos 7
+cannot resolve 'interval_view.str + (- INTERVAL '4 22:12' DAY TO MINUTE)' due 
to data type mismatch: argument 1 requires (timestamp or timestamp without time 
zone) type, however, 'interval_view.str' is of string type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
@@ -1542,7 +1543,8 @@ select str + interval '4 22:12' day to minute from 
interval_view
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'interval_view.str + INTERVAL '4 22:12' DAY TO MINUTE' due to 
data type mismatch: argument 1 requires (timestamp or timestamp without time 
zone) type, however, 'interval_view.str' is of string type.; line 1 pos 7
+cannot resolve 'interval_view.str + INTERVAL '4 22:12' DAY TO MINUTE' due to 
data type mismatch: argument 1 requires (timestamp or timestamp without time 
zone) type, however, 'interval_view.str' is of string type.
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.; line 1 pos 7
 
 
 -- !query
diff --git 
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out 
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out
index 13f3fe0..84dcf3a 100644
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out
@@ -686,6 +686,7 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 Union can only be performed on tables with the compatible column types. The 
first column of the second table is string type which is not compatible with 
decimal(38,18) at same column of first table
+To fix the error, you might need to add explicit type casts. If necessary set 
spark.sql.ansi.enabled to false to bypass this error.
 
 
 -- !query

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-37490][SQL] Show extra hint if analyzer fails due to ANSI type coercion

Reply via email to