Repository: spark Updated Branches: refs/heads/master ba3309684 -> e27212317
[SPARK-8972] [SQL] Incorrect result for rollup We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result. e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP` This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names). Author: Cheng Hao <hao.ch...@intel.com> Closes #7343 from chenghao-intel/expand and squashes the following commits: 1ebbb59 [Cheng Hao] update the comment 827873f [Cheng Hao] update as feedback 34def69 [Cheng Hao] Add more unit test and comments c695760 [Cheng Hao] fix bug of incorrect result for rollup Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2721231 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2721231 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2721231 Branch: refs/heads/master Commit: e27212317c7341852c52d9a85137b8f94cb0d935 Parents: ba33096 Author: Cheng Hao <hao.ch...@intel.com> Authored: Wed Jul 15 23:35:27 2015 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Wed Jul 15 23:35:27 2015 -0700 ---------------------------------------------------------------------- .../spark/sql/catalyst/analysis/Analyzer.scala | 42 +++++++++++++-- ...r CUBE #1-0-63b61fb3f0e74226001ad279be440864 | 6 +++ ...r CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 | 10 ++++ ...oupingSet-0-8c14c24670a4b06c440346277ce9cf1c | 10 ++++ ...Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a | 6 +++ ...Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 | 10 ++++ ...Rollup #3-0-9257085d123728730be96b6d9fbb84ce | 10 ++++ .../sql/hive/execution/HiveQuerySuite.scala | 54 ++++++++++++++++++++ 8 files changed, 145 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 891408e..df8e7f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -194,16 +194,52 @@ class Analyzer( } def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case a if !a.childrenResolved => a // be sure all of the children are resolved. case a: Cube => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations) case a: Rollup => GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations) case x: GroupingSets => val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)() + // We will insert another Projection if the GROUP BY keys contains the + // non-attribute expressions. And the top operators can references those + // expressions by its alias. + // e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==> + // SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a + + // find all of the non-attribute expressions in the GROUP BY keys + val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]() + + // The pair of (the original GROUP BY key, associated attribute) + val groupByExprPairs = x.groupByExprs.map(_ match { + case e: NamedExpression => (e, e.toAttribute) + case other => { + val alias = Alias(other, other.toString)() + nonAttributeGroupByExpressions += alias // add the non-attributes expression alias + (other, alias.toAttribute) + } + }) + + // substitute the non-attribute expressions for aggregations. + val aggregation = x.aggregations.map(expr => expr.transformDown { + case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e) + }.asInstanceOf[NamedExpression]) + + // substitute the group by expressions. + val newGroupByExprs = groupByExprPairs.map(_._2) + + val child = if (nonAttributeGroupByExpressions.length > 0) { + // insert additional projection if contains the + // non-attribute expressions in the GROUP BY keys + Project(x.child.output ++ nonAttributeGroupByExpressions, x.child) + } else { + x.child + } + Aggregate( - x.groupByExprs :+ VirtualColumn.groupingIdAttribute, - x.aggregations, - Expand(x.bitmasks, x.groupByExprs, gid, x.child)) + newGroupByExprs :+ VirtualColumn.groupingIdAttribute, + aggregation, + Expand(x.bitmasks, newGroupByExprs, gid, child)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 new file mode 100644 index 0000000..dac1b84 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 @@ -0,0 +1,6 @@ +500 NULL 0 +91 0 1 +84 1 1 +105 2 1 +113 3 1 +107 4 1 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 new file mode 100644 index 0000000..c7cb747 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 @@ -0,0 +1,10 @@ +1 NULL -3 2 +1 NULL -1 2 +1 NULL 3 2 +1 NULL 4 2 +1 NULL 5 2 +1 NULL 6 2 +1 NULL 12 2 +1 NULL 14 2 +1 NULL 15 2 +1 NULL 22 2 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c new file mode 100644 index 0000000..c7cb747 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c @@ -0,0 +1,10 @@ +1 NULL -3 2 +1 NULL -1 2 +1 NULL 3 2 +1 NULL 4 2 +1 NULL 5 2 +1 NULL 6 2 +1 NULL 12 2 +1 NULL 14 2 +1 NULL 15 2 +1 NULL 22 2 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a new file mode 100644 index 0000000..dac1b84 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a @@ -0,0 +1,6 @@ +500 NULL 0 +91 0 1 +84 1 1 +105 2 1 +113 3 1 +107 4 1 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 new file mode 100644 index 0000000..1eea4a9 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 @@ -0,0 +1,10 @@ +1 0 5 3 +1 0 15 3 +1 0 25 3 +1 0 60 3 +1 0 75 3 +1 0 80 3 +1 0 100 3 +1 0 140 3 +1 0 145 3 +1 0 150 3 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce new file mode 100644 index 0000000..1eea4a9 --- /dev/null +++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce @@ -0,0 +1,10 @@ +1 0 5 3 +1 0 15 3 +1 0 25 3 +1 0 60 3 +1 0 75 3 +1 0 80 3 +1 0 100 3 +1 0 140 3 +1 0 145 3 +1 0 150 3 http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 991da2f..11a843b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } } + createQueryTest("SPARK-8976 Wrong Result for Rollup #1", + """ + SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP + """.stripMargin) + + createQueryTest("SPARK-8976 Wrong Result for Rollup #2", + """ + SELECT + count(*) AS cnt, + key % 5 as k1, + key-5 as k2, + GROUPING__ID as k3 + FROM src group by key%5, key-5 + WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin) + + createQueryTest("SPARK-8976 Wrong Result for Rollup #3", + """ + SELECT + count(*) AS cnt, + key % 5 as k1, + key-5 as k2, + GROUPING__ID as k3 + FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5 + WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin) + + createQueryTest("SPARK-8976 Wrong Result for CUBE #1", + """ + SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE + """.stripMargin) + + createQueryTest("SPARK-8976 Wrong Result for CUBE #2", + """ + SELECT + count(*) AS cnt, + key % 5 as k1, + key-5 as k2, + GROUPING__ID as k3 + FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5 + WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin) + + createQueryTest("SPARK-8976 Wrong Result for GroupingSet", + """ + SELECT + count(*) AS cnt, + key % 5 as k1, + key-5 as k2, + GROUPING__ID as k3 + FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5 + GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin) + createQueryTest("insert table with generator with column name", """ | CREATE TABLE gen_tmp (key Int); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org