This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ee24847ad10 [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
ee24847ad10 is described below

commit ee24847ad100139628a9bffe45f711bdebaa0170
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Thu Jun 9 15:24:11 2022 +0300

    [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to drop support for ANSI intervals in the percentile functions by removing the YearMonthIntervalType and DayTimeIntervalType types from the list of input types. Once ANSI intervals are properly supported, they can be re-enabled. The sketch below illustrates the resulting behavior.
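    A hedged, self-contained illustration of the user-visible effect (not part of this patch; the object name and local-mode session are assumptions, and the assertion mirrors the updated PercentileQuerySuite test in the diff below):
    ```scala
    import org.apache.spark.sql.{AnalysisException, SparkSession}

    // Hypothetical demo object; runs against a build that includes this patch.
    object DisabledIntervalPercentile extends App {
      val spark = SparkSession.builder().master("local[1]").getOrCreate()
      try {
        spark.sql(
          """SELECT percentile(col, 0.5)
            |FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col)
            |""".stripMargin).collect()
      } catch {
        // With intervals removed from the input types, analysis now fails
        // instead of silently producing a DOUBLE result.
        case e: AnalysisException =>
          assert(e.getMessage.contains("data type mismatch"))
      }
      spark.stop()
    }
    ```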
    
    ### Why are the changes needed?
    To avoid confusing users with the results of the percentile functions when the inputs are ANSI intervals. At the moment, the functions return the DOUBLE (or ARRAY OF DOUBLE) type regardless of the input type. For ANSI interval inputs, the functions should return ANSI intervals too. The sketch after this paragraph shows the confusing pre-patch behavior.
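    A minimal sketch of that pre-patch behavior (the query and its result are taken from the doc example removed in this patch; the object name and local-mode session are assumptions, and it only behaves this way on builds that still accept intervals):
    ```scala
    import org.apache.spark.sql.SparkSession

    // Hypothetical demo object; assumes a Spark build *before* this patch.
    object PercentileIntervalBefore extends App {
      val spark = SparkSession.builder().master("local[1]").getOrCreate()
      // Pre-patch this returned [2000000.0,5000000.0]: an ARRAY OF DOUBLE in
      // microseconds, where an array of day-time intervals would be natural.
      spark.sql(
        """SELECT percentile(col, array(0.2, 0.5))
          |FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '10' SECOND) AS tab(col)
          |""".stripMargin).show()
      spark.stop()
    }
    ```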
    
    ### Does this PR introduce _any_ user-facing change?
    No, since the functions haven't been released yet.
    
    ### How was this patch tested?
    By running affected test suites:
    ```
    $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
    $ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
    $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite"
    $ build/sbt "test:testOnly *PercentileSuite"
    $ build/sbt "test:testOnly *PercentileQuerySuite"
    ```
    and checked manually that ANSI intervals are not supported as input types:
    ```sql
    spark-sql> SELECT percentile(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
    Error in query: cannot resolve 'percentile(tab.col, CAST(0.5BD AS DOUBLE), 1L)' due to data type mismatch: argument 1 requires numeric type, however, 'tab.col' is of interval month type.; line 1 pos 7;
    ```
    
    Closes #36817 from MaxGekk/percentile-disable-ansi-interval.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../sql/catalyst/expressions/aggregate/percentiles.scala | 15 ++++-----------
 .../catalyst/expressions/aggregate/PercentileSuite.scala |  8 ++++----
 .../org/apache/spark/sql/PercentileQuerySuite.scala      | 16 ++++++++--------
 3 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index 3acb2f2cc97..0efe09bca2e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -67,8 +67,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
       case _: ArrayType => ArrayType(DoubleType, false)
       case _ => DoubleType
     }
-    Seq(TypeCollection(NumericType, YearMonthIntervalType, DayTimeIntervalType),
-      percentageExpType, IntegralType)
+    Seq(NumericType, percentageExpType, IntegralType)
   }
 
   // Check the inputTypes are valid, and the percentageExpression satisfies:
@@ -288,7 +287,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
   usage =
     """
       _FUNC_(col, percentage [, frequency]) - Returns the exact percentile value of numeric
-       or ansi interval column `col` at the given percentage. The value of percentage must be
+       column `col` at the given percentage. The value of percentage must be
        between 0.0 and 1.0. The value of frequency should be positive integral
 
      _FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) - Returns the exact
@@ -303,10 +302,6 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
        3.0
      > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col);
        [2.5,7.5]
-      > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
-       5.0
-      > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '10' SECOND) AS tab(col);
-       [2000000.0,5000000.0]
   """,
   group = "agg_funcs",
   since = "2.1.0")
@@ -360,13 +355,11 @@ case class Percentile(
 }
 
 @ExpressionDescription(
-  usage = "_FUNC_(col) - Returns the median of numeric or ansi interval column `col`.",
+  usage = "_FUNC_(col) - Returns the median of numeric column `col`.",
   examples = """
     Examples:
       > SELECT _FUNC_(col) FROM VALUES (0), (10) AS tab(col);
        5.0
-      > SELECT _FUNC_(col) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
-       5.0
   """,
   group = "agg_funcs",
   since = "3.4.0")
@@ -385,7 +378,7 @@ case class Median(child: Expression)
 
 /**
  * Return a percentile value based on a continuous distribution of
- * numeric or ansi interval column at the given percentage (specified in ORDER BY clause).
+ * numeric column at the given percentage (specified in ORDER BY clause).
  * The value of percentage must be between 0.0 and 1.0.
  */
 case class PercentileCont(left: Expression, right: Expression, reverse: Boolean = false)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
index e08c6279fd6..a3ae7a44a30 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
@@ -170,8 +170,8 @@ class PercentileSuite extends SparkFunSuite {
       val child = AttributeReference("a", dataType)()
       val percentile = new Percentile(child, percentage)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     val invalidFrequencyDataTypes = Seq(FloatType, DoubleType, BooleanType,
@@ -184,8 +184,8 @@ class PercentileSuite extends SparkFunSuite {
       val frq = AttributeReference("frq", frequencyType)()
       val percentile = new Percentile(child, percentage, frq)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     for(dataType <- validDataTypes;
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
index f39f0c18024..823c1375de0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
@@ -29,21 +29,21 @@ class PercentileQuerySuite extends QueryTest with SharedSparkSession {
 
   private val table = "percentile_test"
 
-  test("SPARK-37138: Support Ansi Interval type in Percentile") {
+  test("SPARK-37138, SPARK-39427: Disable Ansi Interval type in Percentile") {
     withTempView(table) {
       Seq((Period.ofMonths(100), Duration.ofSeconds(100L)),
         (Period.ofMonths(200), Duration.ofSeconds(200L)),
         (Period.ofMonths(300), Duration.ofSeconds(300L)))
         .toDF("col1", "col2").createOrReplaceTempView(table)
-      checkAnswer(
+      val e = intercept[AnalysisException] {
         spark.sql(
           s"""SELECT
-             |  CAST(percentile(col1, 0.5) AS STRING),
-             |  SUM(null),
-             |  CAST(percentile(col2, 0.5) AS STRING)
-             |FROM $table
-           """.stripMargin),
-        Row("200.0", null, "2.0E8"))
+            |  CAST(percentile(col1, 0.5) AS STRING),
+            |  SUM(null),
+            |  CAST(percentile(col2, 0.5) AS STRING)
+            |FROM $table""".stripMargin).collect()
+      }
+      assert(e.getMessage.contains("data type mismatch"))
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
