This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5c963e37fa5 [SPARK-40761][SQL] Migrate type check failures of percentile expressions onto error classes 5c963e37fa5 is described below commit 5c963e37fa5eeac4462d4e29e4322a1564d10370 Author: yangjie01 <yangji...@baidu.com> AuthorDate: Sat Oct 15 09:42:19 2022 +0300 [SPARK-40761][SQL] Migrate type check failures of percentile expressions onto error classes ### What changes were proposed in this pull request? This PR replaces `TypeCheckFailure` with `DataTypeMismatch` in the type checks of the percentile expressions, including `ApproximatePercentile.scala` and `percentiles.scala`. ### Why are the changes needed? Migration onto error classes unifies Spark SQL error messages. ### Does this PR introduce _any_ user-facing change? Yes. The PR changes user-facing error messages. ### How was this patch tested? - Pass GitHub Actions Closes #38234 from LuciferYang/SPARK-40761-2. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- core/src/main/resources/error/error-classes.json | 12 +++- .../aggregate/ApproximatePercentile.scala | 48 +++++++++++--- .../expressions/aggregate/percentiles.scala | 27 ++++++-- .../sql/catalyst/expressions/csvExpressions.scala | 11 +++- .../sql/catalyst/expressions/jsonExpressions.scala | 11 +++- .../aggregate/ApproximatePercentileSuite.scala | 73 +++++++++++++++++----- .../expressions/aggregate/PercentileSuite.scala | 42 +++++++++++-- .../sql-tests/results/csv-functions.sql.out | 6 +- .../sql-tests/results/json-functions.sql.out | 6 +- 9 files changed, 192 insertions(+), 44 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 16b08583e71..8462fe2a42b 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -170,7 +170,7 @@ }, "NON_FOLDABLE_INPUT" : { "message" : [ - "the input should be a foldable 
string expression and not null; however, got <inputExpr>." + "the input <inputName> should be a foldable <inputType> expression; however, got <inputExpr>." ] }, "NON_STRING_TYPE" : { @@ -228,11 +228,21 @@ "parameter <paramIndex> requires <requiredType> type, however, <inputSql> is of <inputType> type." ] }, + "UNEXPECTED_NULL" : { + "message" : [ + "The <exprName> must not be null" + ] + }, "UNSPECIFIED_FRAME" : { "message" : [ "Cannot use an UnspecifiedFrame. This should have been converted during analysis." ] }, + "VALUE_OUT_OF_RANGE" : { + "message" : [ + "The <exprName> must be between <valueRange> (current value = <currentValue>)" + ] + }, "WRONG_NUM_PARAMS" : { "message" : [ "wrong number of parameters: <actualNum>." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index d8eccc075a2..1499f358ac4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -23,8 +23,9 @@ import com.google.common.primitives.{Doubles, Ints, Longs} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult} -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile.PercentileDigest import org.apache.spark.sql.catalyst.trees.TernaryLike import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} @@ -118,17 +119,46 @@ case 
class ApproximatePercentile( val defaultCheck = super.checkInputDataTypes() if (defaultCheck.isFailure) { defaultCheck - } else if (!percentageExpression.foldable || !accuracyExpression.foldable) { - TypeCheckFailure(s"The accuracy or percentage provided must be a constant literal") + } else if (!percentageExpression.foldable) { + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "percentage", + "inputType" -> toSQLType(percentageExpression.dataType), + "inputExpr" -> toSQLExpr(percentageExpression) + ) + ) + } else if (!accuracyExpression.foldable) { + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "accuracy", + "inputType" -> toSQLType(accuracyExpression.dataType), + "inputExpr" -> toSQLExpr(accuracyExpression) + ) + ) } else if (accuracy <= 0 || accuracy > Int.MaxValue) { - TypeCheckFailure(s"The accuracy provided must be a literal between (0, ${Int.MaxValue}]" + - s" (current value = $accuracy)") + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "accuracy", + "valueRange" -> s"(0, ${Int.MaxValue}]", + "currentValue" -> toSQLValue(accuracy, LongType) + ) + ) } else if (percentages == null) { - TypeCheckFailure("Percentage value must not be null") + DataTypeMismatch( + errorSubClass = "UNEXPECTED_NULL", + messageParameters = Map("exprName" -> "percentage")) } else if (percentages.exists(percentage => percentage < 0.0D || percentage > 1.0D)) { - TypeCheckFailure( - s"All percentage values must be between 0.0 and 1.0 " + - s"(current = ${percentages.mkString(", ")})") + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> percentages.map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) } else { TypeCheckSuccess } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala index e9e4d073d65..81bc7e51499 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala @@ -21,8 +21,9 @@ import java.util import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.trees.{BinaryLike, TernaryLike, UnaryLike} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.QueryExecutionErrors @@ -84,14 +85,28 @@ abstract class PercentileBase defaultCheck } else if (!percentageExpression.foldable) { // percentageExpression must be foldable - TypeCheckFailure("The percentage(s) must be a constant literal, " + - s"but got $percentageExpression") + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "percentage", + "inputType" -> toSQLType(percentageExpression.dataType), + "inputExpr" -> toSQLExpr(percentageExpression)) + ) } else if (percentages == null) { - TypeCheckFailure("Percentage value must not be null") + DataTypeMismatch( + errorSubClass = "UNEXPECTED_NULL", + messageParameters = Map("exprName" -> "percentage") + ) } else if (percentages.exists(percentage => percentage < 0.0 || percentage > 1.0)) { // percentages(s) must be in the range [0.0, 1.0] - TypeCheckFailure("Percentage(s) must be between 0.0 and 1.0, " + - s"but got 
$percentageExpression") + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> percentages.map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) } else { TypeCheckSuccess } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index 8965e30e4e9..e47cf493d4c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -185,10 +185,17 @@ case class SchemaOfCsv( override def checkInputDataTypes(): TypeCheckResult = { if (child.foldable && csv != null) { super.checkInputDataTypes() - } else { + } else if (!child.foldable) { DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", - messageParameters = Map("inputExpr" -> toSQLExpr(child))) + messageParameters = Map( + "inputName" -> "csv", + "inputType" -> toSQLType(child.dataType), + "inputExpr" -> toSQLExpr(child))) + } else { + DataTypeMismatch( + errorSubClass = "UNEXPECTED_NULL", + messageParameters = Map("exprName" -> "csv")) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 12da5933ece..9087d4a731e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -794,10 +794,17 @@ case class SchemaOfJson( override def checkInputDataTypes(): TypeCheckResult = { if (child.foldable && json != null) { super.checkInputDataTypes() - } else { + } else if (!child.foldable) { DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", - 
messageParameters = Map("inputExpr" -> toSQLExpr(child))) + messageParameters = Map( + "inputName" -> "json", + "inputType" -> toSQLType(child.dataType), + "inputExpr" -> toSQLExpr(child))) + } else { + DataTypeMismatch( + errorSubClass = "UNEXPECTED_NULL", + messageParameters = Map("exprName" -> "json")) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala index 351d240cc8a..533f60da9b4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala @@ -21,16 +21,17 @@ import java.sql.Date import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, BoundReference, Cast, CreateArray, DecimalLiteral, GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile.{PercentileDigest, PercentileDigestSerializer} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.{ArrayData, QuantileSummaries} import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats -import org.apache.spark.sql.types.{ArrayType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, IntegralType} +import org.apache.spark.sql.types.{ArrayType, 
Decimal, DecimalType, DoubleType, FloatType, IntegerType, IntegralType, LongType} import org.apache.spark.util.SizeEstimator class ApproximatePercentileSuite extends SparkFunSuite { @@ -212,14 +213,22 @@ class ApproximatePercentileSuite extends SparkFunSuite { test("class ApproximatePercentile, fails analysis if percentage or accuracy is not a constant") { val attribute = AttributeReference("a", DoubleType)() + val accuracyExpression = AttributeReference("b", IntegerType)() val wrongAccuracy = new ApproximatePercentile( attribute, percentageExpression = Literal(0.5D), - accuracyExpression = AttributeReference("b", IntegerType)()) + accuracyExpression = accuracyExpression) assertEqual( wrongAccuracy.checkInputDataTypes(), - TypeCheckFailure("The accuracy or percentage provided must be a constant literal") + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "accuracy", + "inputType" -> toSQLType(accuracyExpression.dataType), + "inputExpr" -> toSQLExpr(accuracyExpression) + ) + ) ) val wrongPercentage = new ApproximatePercentile( @@ -229,19 +238,34 @@ class ApproximatePercentileSuite extends SparkFunSuite { assertEqual( wrongPercentage.checkInputDataTypes(), - TypeCheckFailure("The accuracy or percentage provided must be a constant literal") + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "percentage", + "inputType" -> toSQLType(attribute.dataType), + "inputExpr" -> toSQLExpr(attribute) + ) + ) ) } test("class ApproximatePercentile, fails analysis if parameters are invalid") { + val wrongAccuracyExpression = Literal(-1) val wrongAccuracy = new ApproximatePercentile( AttributeReference("a", DoubleType)(), percentageExpression = Literal(0.5D), - accuracyExpression = Literal(-1)) + accuracyExpression = wrongAccuracyExpression) assertEqual( wrongAccuracy.checkInputDataTypes(), - TypeCheckFailure(s"The accuracy provided must be a literal between (0, ${Int.MaxValue}]" 
+ - " (current value = -1)")) + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "accuracy", + "valueRange" -> s"(0, ${Int.MaxValue}]", + "currentValue" -> + toSQLValue(wrongAccuracyExpression.eval().asInstanceOf[Number].longValue, LongType)) + ) + ) val correctPercentageExpressions = Seq( Literal(0.1f, FloatType), @@ -273,11 +297,32 @@ class ApproximatePercentileSuite extends SparkFunSuite { percentageExpression = percentageExpression, accuracyExpression = Literal(100)) - assert( - wrongPercentage.checkInputDataTypes() match { - case TypeCheckFailure(msg) if msg.contains("must be between 0.0 and 1.0") => true - case _ => false - }) + percentageExpression.eval() match { + case array: ArrayData => + assertEqual(wrongPercentage.checkInputDataTypes(), + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> + array.toDoubleArray().map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) + ) + case other => + assertEqual(wrongPercentage.checkInputDataTypes(), + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> + Array(other).map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) + ) + } } } @@ -320,7 +365,7 @@ class ApproximatePercentileSuite extends SparkFunSuite { assert(new ApproximatePercentile( AttributeReference("a", DoubleType)(), percentageExpression = Literal(null, DoubleType)).checkInputDataTypes() === - TypeCheckFailure("Percentage value must not be null")) + DataTypeMismatch(errorSubClass = "UNEXPECTED_NULL", Map("exprName" -> "percentage"))) val nullPercentageExprs = Seq(CreateArray(Seq(null).map(Literal(_))), CreateArray(Seq(0.1D, null).map(Literal(_)))) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala index 61ccaefb270..622f7ad548e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types._ @@ -208,9 +209,32 @@ class PercentileSuite extends SparkFunSuite { invalidPercentages.foreach { percentage => val percentile2 = new Percentile(child, percentage) - assertEqual(percentile2.checkInputDataTypes(), - TypeCheckFailure(s"Percentage(s) must be between 0.0 and 1.0, " + - s"but got ${percentage.simpleString(100)}")) + percentage.eval() match { + case array: ArrayData => + assertEqual(percentile2.checkInputDataTypes(), + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> + array.toDoubleArray().map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) + ) + case other => + assertEqual(percentile2.checkInputDataTypes(), + DataTypeMismatch( + errorSubClass = "VALUE_OUT_OF_RANGE", + messageParameters = Map( + "exprName" -> "percentage", + "valueRange" -> "[0.0, 1.0]", + "currentValue" -> + Array(other).map(toSQLValue(_, DoubleType)).mkString(",") + ) + ) + ) + } } val nonFoldablePercentage = Seq(NonFoldableLiteral(0.5), @@ -219,8 +243,14 @@ class PercentileSuite extends SparkFunSuite { nonFoldablePercentage.foreach { percentage => val percentile3 = new 
Percentile(child, percentage) assertEqual(percentile3.checkInputDataTypes(), - TypeCheckFailure(s"The percentage(s) must be a constant literal, " + - s"but got ${percentage}")) + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> "percentage", + "inputType" -> toSQLType(percentage.dataType), + "inputExpr" -> toSQLExpr(percentage)) + ) + ) } val invalidDataTypes = Seq(ByteType, ShortType, IntegerType, LongType, FloatType, @@ -261,7 +291,7 @@ class PercentileSuite extends SparkFunSuite { assert(new Percentile( AttributeReference("a", DoubleType)(), percentageExpression = Literal(null, DoubleType)).checkInputDataTypes() === - TypeCheckFailure("Percentage value must not be null")) + DataTypeMismatch(errorSubClass = "UNEXPECTED_NULL", Map("exprName" -> "percentage"))) val nullPercentageExprs = Seq(CreateArray(Seq(null).map(Literal(_))), CreateArray(Seq(0.1D, null).map(Literal(_)))) diff --git a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out index cbc9d839a41..d76f20ee835 100644 --- a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out @@ -110,9 +110,9 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_NULL", "messageParameters" : { - "inputExpr" : "\"NULL\"", + "exprName" : "csv", "sqlExpr" : "\"schema_of_csv(NULL)\"" }, "queryContext" : [ { @@ -143,6 +143,8 @@ org.apache.spark.sql.AnalysisException "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", "messageParameters" : { "inputExpr" : "\"csvField\"", + "inputName" : "csv", + "inputType" : "\"STRING\"", "sqlExpr" : "\"schema_of_csv(csvField)\"" }, "queryContext" : [ { diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 1ecfe2197ed..96a0fc935c0 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -430,9 +430,9 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_NULL", "messageParameters" : { - "inputExpr" : "\"NULL\"", + "exprName" : "json", "sqlExpr" : "\"schema_of_json(NULL)\"" }, "queryContext" : [ { @@ -463,6 +463,8 @@ org.apache.spark.sql.AnalysisException "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", "messageParameters" : { "inputExpr" : "\"jsonField\"", + "inputName" : "json", + "inputType" : "\"STRING\"", "sqlExpr" : "\"schema_of_json(jsonField)\"" }, "queryContext" : [ { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org