maropu commented on a change in pull request #30212:
URL: https://github.com/apache/spark/pull/30212#discussion_r529480121



##########
File path: 
sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
##########
@@ -587,13 +587,24 @@ fromClause
     ;
 
 aggregationClause
-    : GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
+    : GROUP BY groupingExpressionWithGroupingAnalytics+=groupByClause
+      (',' groupingExpressionWithGroupingAnalytics+=groupByClause)*
+    | GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
       WITH kind=ROLLUP
     | WITH kind=CUBE
     | kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?

Review comment:
      Why do we still need this definition?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
##########
@@ -665,13 +676,15 @@ class Analyzer(
     // CUBE/ROLLUP/GROUPING SETS. This also replace grouping()/grouping_id() 
in resolved
     // Filter/Sort.
     def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsDown {
-      case h @ UnresolvedHaving(
-          _, agg @ Aggregate(Seq(c @ Cube(groupByExprs)), 
aggregateExpressions, _))
-          if agg.childrenResolved && (groupByExprs ++ 
aggregateExpressions).forall(_.resolved) =>
+      case h @ UnresolvedHaving(_, agg @ Aggregate(Seq(c @ Cube(_)), 
aggregateExpressions, _))
+          if agg.childrenResolved && (c.groupByExprs ++ 
aggregateExpressions).forall(_.resolved) =>
+        tryResolveHavingCondition(h)
+      case h @ UnresolvedHaving(_, agg @ Aggregate(Seq(r @ Rollup(_)), 
aggregateExpressions, _))
+          if agg.childrenResolved && (r.groupByExprs ++ 
aggregateExpressions).forall(_.resolved) =>
         tryResolveHavingCondition(h)
       case h @ UnresolvedHaving(
-          _, agg @ Aggregate(Seq(r @ Rollup(groupByExprs)), 
aggregateExpressions, _))
-          if agg.childrenResolved && (groupByExprs ++ 
aggregateExpressions).forall(_.resolved) =>
+      _, agg @ Aggregate(Seq(gs @ GroupingSetsV2(_, _)), aggregateExpressions, 
_))

Review comment:
      Wrong indentation here.

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
##########
@@ -1971,65 +2025,84 @@ class Analyzer(
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
     val trimWarningEnabled = new AtomicBoolean(true)
+
+    def resolveFunction(): PartialFunction[Expression, Expression] = {
+      case u if !u.childrenResolved => u // Skip until children are resolved.
+      case u: UnresolvedAttribute if resolver(u.name, 
VirtualColumn.hiveGroupingIdName) =>
+        withPosition(u) {
+          Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)()
+        }
+      case u@UnresolvedGenerator(name, children) =>
+        withPosition(u) {
+          v1SessionCatalog.lookupFunction(name, children) match {
+            case generator: Generator => generator
+            case other =>
+              failAnalysis(s"$name is expected to be a generator. However, " +
+                s"its class is ${other.getClass.getCanonicalName}, which is 
not a generator.")
+          }
+        }
+      case u@UnresolvedFunction(funcId, arguments, isDistinct, filter) =>

Review comment:
       ditto

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
##########
@@ -1971,65 +2025,84 @@ class Analyzer(
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
     val trimWarningEnabled = new AtomicBoolean(true)
+
+    def resolveFunction(): PartialFunction[Expression, Expression] = {
+      case u if !u.childrenResolved => u // Skip until children are resolved.
+      case u: UnresolvedAttribute if resolver(u.name, 
VirtualColumn.hiveGroupingIdName) =>
+        withPosition(u) {
+          Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)()
+        }
+      case u@UnresolvedGenerator(name, children) =>

Review comment:
       `u@UnresolvedGenerator` -> `u @ UnresolvedGenerator`

##########
File path: 
sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
##########
@@ -587,13 +587,24 @@ fromClause
     ;
 
 aggregationClause
-    : GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
+    : GROUP BY groupingExpressionWithGroupingAnalytics+=groupByClause
+      (',' groupingExpressionWithGroupingAnalytics+=groupByClause)*
+    | GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
       WITH kind=ROLLUP
     | WITH kind=CUBE
     | kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?
     | GROUP BY kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')'
     ;
 
+groupByClause
+    : groupingAnalytics
+    | expression
+    ;
+
+groupingAnalytics
+    : (ROLLUP | CUBE | GROUPING SETS)  '(' groupingSet (',' groupingSet)* ')'

Review comment:
       If this PR includes the `groupingSet` support for `CUBE`/`ROLLUP`, 
please add tests for the new feature.

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
##########
@@ -1971,65 +2025,84 @@ class Analyzer(
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
     val trimWarningEnabled = new AtomicBoolean(true)
+
+    def resolveFunction(): PartialFunction[Expression, Expression] = {
+      case u if !u.childrenResolved => u // Skip until children are resolved.
+      case u: UnresolvedAttribute if resolver(u.name, 
VirtualColumn.hiveGroupingIdName) =>
+        withPosition(u) {
+          Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)()
+        }
+      case u@UnresolvedGenerator(name, children) =>
+        withPosition(u) {
+          v1SessionCatalog.lookupFunction(name, children) match {
+            case generator: Generator => generator
+            case other =>
+              failAnalysis(s"$name is expected to be a generator. However, " +
+                s"its class is ${other.getClass.getCanonicalName}, which is 
not a generator.")
+          }
+        }
+      case u@UnresolvedFunction(funcId, arguments, isDistinct, filter) =>
+        withPosition(u) {
+          v1SessionCatalog.lookupFunction(funcId, arguments) match {
+            // AggregateWindowFunctions are AggregateFunctions that can only 
be evaluated within
+            // the context of a Window clause. They do not need to be wrapped 
in an
+            // AggregateExpression.
+            case wf: AggregateWindowFunction =>
+              if (isDistinct || filter.isDefined) {
+                failAnalysis("DISTINCT or FILTER specified, " +
+                  s"but ${wf.prettyName} is not an aggregate function")
+              } else {
+                wf
+              }
+            // We get an aggregate function, we need to wrap it in an 
AggregateExpression.
+            case agg: AggregateFunction =>
+              if (filter.isDefined && !filter.get.deterministic) {
+                failAnalysis("FILTER expression is non-deterministic, " +
+                  "it cannot be used in aggregate functions")
+              }
+              AggregateExpression(agg, Complete, isDistinct, filter)
+            // This function is not an aggregate function, just return the 
resolved one.
+            case other if (isDistinct || filter.isDefined) =>
+              failAnalysis("DISTINCT or FILTER specified, " +
+                s"but ${other.prettyName} is not an aggregate function")
+            case e: String2TrimExpression if arguments.size == 2 =>
+              if (trimWarningEnabled.get) {
+                log.warn("Two-parameter TRIM/LTRIM/RTRIM function signatures 
are deprecated." +

Review comment:
       (I know this is not related to this PR though) `logWarning`?

##########
File path: 
sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
##########
@@ -587,13 +587,24 @@ fromClause
     ;
 
 aggregationClause
-    : GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
+    : GROUP BY groupingExpressionWithGroupingAnalytics+=groupByClause
+      (',' groupingExpressionWithGroupingAnalytics+=groupByClause)*
+    | GROUP BY groupingExpressions+=expression (',' 
groupingExpressions+=expression)* (
       WITH kind=ROLLUP
     | WITH kind=CUBE
     | kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?
     | GROUP BY kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')'
     ;
 
+groupByClause
+    : groupingAnalytics
+    | expression
+    ;
+
+groupingAnalytics
+    : (ROLLUP | CUBE | GROUPING SETS)  '(' groupingSet (',' groupingSet)* ')'

Review comment:
       `(ROLLUP | CUBE | GROUPING SETS) ` => `kind=(ROLLUP | CUBE | GROUPING 
SETS) `?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +39,22 @@ trait GroupingSet extends Expression with CodegenFallback {
   override def eval(input: InternalRow): Any = throw new 
UnsupportedOperationException
 }
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   2       1
-        NULL   NULL    2
-        Bob    NULL    1
-        NULL   5       1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+case class Cube(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+  override def groupByExprs: Seq[Expression] =
+    groupingSets.flatMap(_.distinct).distinct
+}
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   NULL    2
-        Bob    NULL    1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}
+case class Rollup(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+  override def groupByExprs: Seq[Expression] =
+    groupingSets.flatMap(_.distinct).distinct
+}
+
+case class GroupingSetsV2(

Review comment:
       Why does `GroupingSets` and `GroupingSetsV2` need to co-exist?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

Reply via email to