This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 24b29adcf53 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy 24b29adcf53 is described below commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b Author: Max Gekk <max.g...@gmail.com> AuthorDate: Wed Sep 6 10:32:37 2023 +0300 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check the `accuracy` argument is not NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with the new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../aggregate/ApproximatePercentile.scala | 7 ++++- .../sql/ApproximatePercentileQuerySuite.scala | 31 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 3c3afc1c7e7..5b44c3fa31b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -97,7 +97,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -138,6 +139,10 @@ case class ApproximatePercentile( "inputExpr" -> toSQLExpr(accuracyExpression) ) ) + } else if (accuracyNum == null) { + DataTypeMismatch( + errorSubClass = "UNEXPECTED_NULL", + messageParameters = Map("exprName" -> "accuracy")) } else if (accuracy <= 0 || accuracy > Int.MaxValue) { DataTypeMismatch( errorSubClass = "VALUE_OUT_OF_RANGE", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 18e8dd6249b..273e8e08fd7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -339,4 +339,35 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { + checkError( + exception = intercept[AnalysisException] { + sql( + """ + |SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL", + parameters = Map( + "exprName" -> "accuracy", + "sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""), + context = ExpectedContext( + "", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)")) + checkError( + exception = intercept[AnalysisException] { + sql( + """ + |SELECT percentile_approx(col, NULL, 100) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL", + parameters = Map( + "exprName" -> "percentage", + "sqlExpr" -> "\"percentile_approx(col, NULL, 100)\""), + context = ExpectedContext( + "", "", 8, 40, "percentile_approx(col, NULL, 100)")) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org