This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new 5e9ad2e71f9 [SPARK-42259][SQL] ResolveGroupingAnalytics should take care of Python UDAF
5e9ad2e71f9 is described below

commit 5e9ad2e71f9a22d2f1b2d47b2e6bee323b354eb7
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Wed Feb 1 17:36:14 2023 +0800

    [SPARK-42259][SQL] ResolveGroupingAnalytics should take care of Python UDAF
    
    ### What changes were proposed in this pull request?
    
    This fixes a long-standing correctness issue with Python UDAFs and grouping analytics. The rule `ResolveGroupingAnalytics` should take care of Python UDAFs when matching aggregate expressions.
    
    ### Why are the changes needed?
    
    bug fix
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Queries that combine a Python UDAF with grouping analytics (CUBE, ROLLUP, GROUPING SETS) returned wrong results before this change.
    
    ### How was this patch tested?
    
    existing tests
    
    Closes #39824 from cloud-fan/python.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 1219c8492376e038894111cd5d922229260482e7)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/analysis/Analyzer.scala     |  2 +-
 .../results/udaf/udaf-group-analytics.sql.out      | 58 +++++++++++-----------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 48ea0460725..7a92c46577d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -622,7 +622,7 @@ class Analyzer(override val catalogManager: CatalogManager)
         // AggregateExpression should be computed on the unmodified value of its argument
         // expressions, so we should not replace any references to grouping expression
         // inside it.
-        case e: AggregateExpression =>
+        case e if AggregateExpression.isAggregate(e) =>
           aggsBuffer += e
           e
         case e if isPartOfAggregation(e) => e
diff --git a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
index b8c94b19d81..f0be6f43642 100644
--- a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
@@ -15,18 +15,18 @@ SELECT a + b, b, udaf(a - b) FROM testData GROUP BY a + b, b WITH CUBE
 struct<(a + b):int,b:int,udaf((a - b)):int>
 -- !query output
 2      1       1
-2      NULL    0
+2      NULL    1
 3      1       1
 3      2       1
-3      NULL    0
+3      NULL    2
 4      1       1
 4      2       1
-4      NULL    0
+4      NULL    2
 5      2       1
-5      NULL    0
+5      NULL    1
 NULL   1       3
 NULL   2       3
-NULL   NULL    0
+NULL   NULL    6
 
 
 -- !query
@@ -36,16 +36,16 @@ struct<a:int,b:int,udaf(b):int>
 -- !query output
 1      1       1
 1      2       1
-1      NULL    0
+1      NULL    2
 2      1       1
 2      2       1
-2      NULL    0
+2      NULL    2
 3      1       1
 3      2       1
-3      NULL    0
+3      NULL    2
 NULL   1       3
 NULL   2       3
-NULL   NULL    0
+NULL   NULL    6
 
 
 -- !query
@@ -54,16 +54,16 @@ SELECT a + b, b, udaf(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
 struct<(a + b):int,b:int,udaf((a - b)):int>
 -- !query output
 2      1       1
-2      NULL    0
+2      NULL    1
 3      1       1
 3      2       1
-3      NULL    0
+3      NULL    2
 4      1       1
 4      2       1
-4      NULL    0
+4      NULL    2
 5      2       1
-5      NULL    0
-NULL   NULL    0
+5      NULL    1
+NULL   NULL    6
 
 
 -- !query
@@ -73,14 +73,14 @@ struct<a:int,b:int,udaf(b):int>
 -- !query output
 1      1       1
 1      2       1
-1      NULL    0
+1      NULL    2
 2      1       1
 2      2       1
-2      NULL    0
+2      NULL    2
 3      1       1
 3      2       1
-3      NULL    0
-NULL   NULL    0
+3      NULL    2
+NULL   NULL    6
 
 
 -- !query
@@ -416,14 +416,14 @@ GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER
 -- !query schema
 struct<course:string,sum:int>
 -- !query output
-NULL   0
-Java   0
+NULL   5
 Java   1
 Java   1
-dotNET 0
+Java   2
 dotNET 1
 dotNET 1
 dotNET 1
+dotNET 3
 
 
 -- !query
@@ -432,14 +432,14 @@ GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER
 -- !query schema
 struct<course:string,sum:int,grouping_id(course, earnings):bigint>
 -- !query output
-NULL   0       3
-Java   0       1
+NULL   5       3
 Java   1       0
 Java   1       0
-dotNET 0       1
+Java   2       1
 dotNET 1       0
 dotNET 1       0
 dotNET 1       0
+dotNET 3       1
 
 
 -- !query
@@ -468,16 +468,16 @@ SELECT a + b AS k, b, udaf(a - b) FROM testData GROUP BY ROLLUP(k, b)
 struct<k:int,b:int,udaf((a - b)):int>
 -- !query output
 2      1       1
-2      NULL    0
+2      NULL    1
 3      1       1
 3      2       1
-3      NULL    0
+3      NULL    2
 4      1       1
 4      2       1
-4      NULL    0
+4      NULL    2
 5      2       1
-5      NULL    0
-NULL   NULL    0
+5      NULL    1
+NULL   NULL    6
 
 
 -- !query


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to