AngersZhuuuu commented on a change in pull request #30144: URL: https://github.com/apache/spark/pull/30144#discussion_r604725030
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ########## @@ -212,3 +212,29 @@ object GroupingID { if (SQLConf.get.integerGroupingIdEnabled) IntegerType else LongType } } + + +object GroupingAnalytics { + def unapply(exprs: Seq[Expression]): + Option[(Seq[Seq[Expression]], Seq[Seq[Expression]], Seq[Expression])] = { + + val (groupingSets, others) = exprs.partition(_.isInstanceOf[GroupingSet]) + if (groupingSets.isEmpty) { + None + } else { + val groups = + groupingSets.flatMap(_.asInstanceOf[GroupingSet].groupByExprs) ++ others + val selectedGroupByExprs = + groupingSets.map(_.asInstanceOf[GroupingSet].selectedGroupByExprs) + .foldRight(Seq.empty[Seq[Expression]]) { (x, y) => + if (y.isEmpty) { Review comment: > We need this check? We cannot write it like this? > > ``` > .foldRight(Seq.empty[Seq[Expression]]) { (x, y) => > for (a <- x; b <- y) yield b ++ a > }.map(others ++ _).map(_.distinct) > ``` We can't, since with `foldRight(Seq.empty[Seq[Expression]])` this empty Seq should be handled. Is there any other Scala collection that can avoid this problem? 
########## File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql ########## @@ -69,4 +69,17 @@ SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY groupi -- Aliases in SELECT could be used in ROLLUP/CUBE/GROUPING SETS SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2); SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b); -SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k) +SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k); + +-- GROUP BY use mixed Separate columns and CUBE/ROLLUP/Gr +SELECT a, b, count(1) FROM testData GROUP BY a, b, CUBE(a, b); +SELECT a, b, count(1) FROM testData GROUP BY a, b, ROLLUP(a, b); +SELECT a, b, count(1) FROM testData GROUP BY CUBE(a, b), ROLLUP(a, b); +SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), ROLLUP(b); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), (a), ()); +SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), GROUPING SETS((a, b), (a), ()); +SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), ROLLUP(a, b), GROUPING SETS((a, b), (a), ()); + Review comment: > nit: we don't need the two blank lines. Done ########## File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala ########## @@ -3714,6 +3714,32 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } + test("SPARK-33229: Support GROUP BY use Separate columns and CUBE/ROLLUP") { Review comment: > We still need this test? 
Removed ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ########## @@ -212,3 +212,29 @@ object GroupingID { if (SQLConf.get.integerGroupingIdEnabled) IntegerType else LongType } } + + +object GroupingAnalytics { + def unapply(exprs: Seq[Expression]): + Option[(Seq[Seq[Expression]], Seq[Seq[Expression]], Seq[Expression])] = { + + val (groupingSets, others) = exprs.partition(_.isInstanceOf[GroupingSet]) + if (groupingSets.isEmpty) { + None + } else { + val groups = + groupingSets.flatMap(_.asInstanceOf[GroupingSet].groupByExprs) ++ others Review comment: > Since the cast `.asInstanceOf[GroupingSet]` appears three times, could you cast it only once at the beginning? Done ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ########## @@ -212,3 +212,29 @@ object GroupingID { if (SQLConf.get.integerGroupingIdEnabled) IntegerType else LongType } } + + +object GroupingAnalytics { + def unapply(exprs: Seq[Expression]): + Option[(Seq[Seq[Expression]], Seq[Seq[Expression]], Seq[Expression])] = { + Review comment: > nit: remove the unnecessary blank. Done ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ########## @@ -212,3 +212,29 @@ object GroupingID { if (SQLConf.get.integerGroupingIdEnabled) IntegerType else LongType } } + + +object GroupingAnalytics { + def unapply(exprs: Seq[Expression]): + Option[(Seq[Seq[Expression]], Seq[Seq[Expression]], Seq[Expression])] = { Review comment: > ``` > def unapply(exprs: Seq[Expression]) > : Option[(Seq[Seq[Expression]], Seq[Seq[Expression]], Seq[Expression])] = { > ``` Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org