This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new 5250ed65cf2 [SPARK-45079][SQL][3.3] Fix an internal error from `percentile_approx()` on `NULL` accuracy 5250ed65cf2 is described below commit 5250ed65cf2c70e4b456c96c1006b854f56ef1f2 Author: Max Gekk <max.g...@gmail.com> AuthorDate: Wed Sep 6 18:56:14 2023 +0300 [SPARK-45079][SQL][3.3] Fix an internal error from `percentile_approx()` on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check that the `accuracy` argument is not NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. This is a backport of https://github.com/apache/spark/pull/42817. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Authored-by: Max Gekk <max.gekk@gmail.com> (cherry picked from commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b) Closes #42835 from MaxGekk/fix-internal-error-in-percentile_approx-3.3. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../expressions/aggregate/ApproximatePercentile.scala | 5 ++++- .../spark/sql/ApproximatePercentileQuerySuite.scala | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index d8eccc075a2..b816e4a9719 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -95,7 +95,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -120,6 +121,8 @@ case class ApproximatePercentile( defaultCheck } else if (!percentageExpression.foldable || !accuracyExpression.foldable) { TypeCheckFailure(s"The accuracy or percentage provided must be a constant literal") + } else if (accuracyNum == null) { + TypeCheckFailure("Accuracy value must not be null") } else if (accuracy <= 0 || accuracy > Int.MaxValue) { TypeCheckFailure(s"The accuracy provided must be a literal between (0, ${Int.MaxValue}]" + s" (current value = $accuracy)") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 
9237c9e9486..3fd1592a107 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -337,4 +337,23 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { + val e1 = intercept[AnalysisException] { + sql( + """ + |SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() + } + assert(e1.getMessage.contains("Accuracy value must not be null")) + val e2 = intercept[AnalysisException] { + sql( + """ + |SELECT percentile_approx(col, NULL, 100) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() + } + assert(e2.getMessage.contains("Percentage value must not be null")) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org