spark git commit: [SPARK-8972] [SQL] Incorrect result for rollup

yhuai Wed, 15 Jul 2015 23:36:08 -0700

Repository: spark
Updated Branches:
  refs/heads/master ba3309684 -> e27212317



[SPARK-8972] [SQL] Incorrect result for rollup

We don't support the complex expression keys in the rollup/cube, and we even 
will not report it if we have the complex group by keys, that will cause very 
confusing/incorrect result.

e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP`

This PR adds an additional project during the analyzing for the complex GROUP 
BY keys, and that projection will be the child of `Expand`, so to `Expand`, the 
GROUP BY KEY are always the simple key(attribute names).

Author: Cheng Hao <hao.ch...@intel.com>

Closes #7343 from chenghao-intel/expand and squashes the following commits:

1ebbb59 [Cheng Hao] update the comment
827873f [Cheng Hao] update as feedback
34def69 [Cheng Hao] Add more unit test and comments
c695760 [Cheng Hao] fix bug of incorrect result for rollup


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2721231
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2721231
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2721231

Branch: refs/heads/master
Commit: e27212317c7341852c52d9a85137b8f94cb0d935
Parents: ba33096
Author: Cheng Hao <hao.ch...@intel.com>
Authored: Wed Jul 15 23:35:27 2015 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Wed Jul 15 23:35:27 2015 -0700

----------------------------------------------------------------------
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 42 +++++++++++++--
 ...r CUBE #1-0-63b61fb3f0e74226001ad279be440864 |  6 +++
 ...r CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 | 10 ++++
 ...oupingSet-0-8c14c24670a4b06c440346277ce9cf1c | 10 ++++
 ...Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a |  6 +++
 ...Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 | 10 ++++
 ...Rollup #3-0-9257085d123728730be96b6d9fbb84ce | 10 ++++
 .../sql/hive/execution/HiveQuerySuite.scala     | 54 ++++++++++++++++++++
 8 files changed, 145 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 891408e..df8e7f2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -194,16 +194,52 @@ class Analyzer(
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      case a if !a.childrenResolved => a // be sure all of the children are 
resolved.
       case a: Cube =>
         GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
       case a: Rollup =>
         GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
       case x: GroupingSets =>
         val gid = AttributeReference(VirtualColumn.groupingIdName, 
IntegerType, false)()
+        // We will insert another Projection if the GROUP BY keys contains the
+        // non-attribute expressions. And the top operators can references 
those
+        // expressions by its alias.
+        // e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
+        //      SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a
+
+        // find all of the non-attribute expressions in the GROUP BY keys
+        val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()
+
+        // The pair of (the original GROUP BY key, associated attribute)
+        val groupByExprPairs = x.groupByExprs.map(_ match {
+          case e: NamedExpression => (e, e.toAttribute)
+          case other => {
+            val alias = Alias(other, other.toString)()
+            nonAttributeGroupByExpressions += alias // add the non-attributes 
expression alias
+            (other, alias.toAttribute)
+          }
+        })
+
+        // substitute the non-attribute expressions for aggregations.
+        val aggregation = x.aggregations.map(expr => expr.transformDown {
+          case e => 
groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
+        }.asInstanceOf[NamedExpression])
+
+        // substitute the group by expressions.
+        val newGroupByExprs = groupByExprPairs.map(_._2)
+
+        val child = if (nonAttributeGroupByExpressions.length > 0) {
+          // insert additional projection if contains the
+          // non-attribute expressions in the GROUP BY keys
+          Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
+        } else {
+          x.child
+        }
+
         Aggregate(
-          x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
-          x.aggregations,
-          Expand(x.bitmasks, x.groupByExprs, gid, x.child))
+          newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
+          aggregation,
+          Expand(x.bitmasks, newGroupByExprs, gid, child))
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
CUBE #1-0-63b61fb3f0e74226001ad279be440864 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE 
#1-0-63b61fb3f0e74226001ad279be440864
new file mode 100644
index 0000000..dac1b84
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE 
#1-0-63b61fb3f0e74226001ad279be440864 
@@ -0,0 +1,6 @@
+500    NULL    0
+91     0       1
+84     1       1
+105    2       1
+113    3       1
+107    4       1

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE 
#2-0-7a511f02a16f0af4f810b1666cfcd896
new file mode 100644
index 0000000..c7cb747
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE 
#2-0-7a511f02a16f0af4f810b1666cfcd896 
@@ -0,0 +1,10 @@
+1      NULL    -3      2
+1      NULL    -1      2
+1      NULL    3       2
+1      NULL    4       2
+1      NULL    5       2
+1      NULL    6       2
+1      NULL    12      2
+1      NULL    14      2
+1      NULL    15      2
+1      NULL    22      2

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
new file mode 100644
index 0000000..c7cb747
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c     
@@ -0,0 +1,10 @@
+1      NULL    -3      2
+1      NULL    -1      2
+1      NULL    3       2
+1      NULL    4       2
+1      NULL    5       2
+1      NULL    6       2
+1      NULL    12      2
+1      NULL    14      2
+1      NULL    15      2
+1      NULL    22      2

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#1-0-a78e3dbf242f240249e36b3d3fd0926a
new file mode 100644
index 0000000..dac1b84
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#1-0-a78e3dbf242f240249e36b3d3fd0926a       
@@ -0,0 +1,6 @@
+500    NULL    0
+91     0       1
+84     1       1
+105    2       1
+113    3       1
+107    4       1

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#2-0-bf180c9d1a18f61b9d9f31bb0115cf89
new file mode 100644
index 0000000..1eea4a9
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#2-0-bf180c9d1a18f61b9d9f31bb0115cf89       
@@ -0,0 +1,10 @@
+1      0       5       3
+1      0       15      3
+1      0       25      3
+1      0       60      3
+1      0       75      3
+1      0       80      3
+1      0       100     3
+1      0       140     3
+1      0       145     3
+1      0       150     3

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976
 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for 
Rollup #3-0-9257085d123728730be96b6d9fbb84ce 
b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#3-0-9257085d123728730be96b6d9fbb84ce
new file mode 100644
index 0000000..1eea4a9
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup 
#3-0-9257085d123728730be96b6d9fbb84ce       
@@ -0,0 +1,10 @@
+1      0       5       3
+1      0       15      3
+1      0       25      3
+1      0       60      3
+1      0       75      3
+1      0       80      3
+1      0       100     3
+1      0       140     3
+1      0       145     3
+1      0       150     3

http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 991da2f..11a843b 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with 
BeforeAndAfter {
     }
   }
 
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
+    """
+      SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 
WITH ROLLUP
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM src group by key%5, key-5
+      WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
+    """
+      SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 
WITH CUBE
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
   createQueryTest("insert table with generator with column name",
     """
       |  CREATE TABLE gen_tmp (key Int);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-8972] [SQL] Incorrect result for rollup

Reply via email to