Repository: spark Updated Branches: refs/heads/master 1842c55d8 -> e88bff127
[SPARK-13235][SQL] Removed an Extra Distinct from the Plan when Using Union in SQL Currently, the parser adds two `Distinct` operators to the plan when `Union` or `Union Distinct` is used in SQL. This PR removes the extra `Distinct` from the plan. For example, before the fix, the following query produces a plan with two `Distinct` operators: ```scala sql("select * from t0 union select * from t0").explain(true) ``` ``` == Parsed Logical Plan == 'Project [unresolvedalias(*,None)] +- 'Subquery u_2 +- 'Distinct +- 'Project [unresolvedalias(*,None)] +- 'Subquery u_1 +- 'Distinct +- 'Union :- 'Project [unresolvedalias(*,None)] : +- 'UnresolvedRelation `t0`, None +- 'Project [unresolvedalias(*,None)] +- 'UnresolvedRelation `t0`, None == Analyzed Logical Plan == id: bigint Project [id#16L] +- Subquery u_2 +- Distinct +- Project [id#16L] +- Subquery u_1 +- Distinct +- Union :- Project [id#16L] : +- Subquery t0 : +- Relation[id#16L] ParquetRelation +- Project [id#16L] +- Subquery t0 +- Relation[id#16L] ParquetRelation == Optimized Logical Plan == Aggregate [id#16L], [id#16L] +- Aggregate [id#16L], [id#16L] +- Union :- Project [id#16L] : +- Relation[id#16L] ParquetRelation +- Project [id#16L] +- Relation[id#16L] ParquetRelation ``` After the fix, the plan no longer contains the extra `Distinct`: ``` == Parsed Logical Plan == 'Project [unresolvedalias(*,None)] +- 'Subquery u_1 +- 'Distinct +- 'Union :- 'Project [unresolvedalias(*,None)] : +- 'UnresolvedRelation `t0`, None +- 'Project [unresolvedalias(*,None)] +- 'UnresolvedRelation `t0`, None == Analyzed Logical Plan == id: bigint Project [id#17L] +- Subquery u_1 +- Distinct +- Union :- Project [id#16L] : +- Subquery t0 : +- Relation[id#16L] ParquetRelation +- Project [id#16L] +- Subquery t0 +- Relation[id#16L] ParquetRelation == Optimized Logical Plan == Aggregate [id#17L], [id#17L] +- Union :- Project [id#16L] : +- Relation[id#16L] ParquetRelation +- Project [id#16L] +- Relation[id#16L] ParquetRelation ``` Author: 
gatorsmile <gatorsm...@gmail.com> Closes #11120 from gatorsmile/unionDistinct. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e88bff12 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e88bff12 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e88bff12 Branch: refs/heads/master Commit: e88bff12795a6134e2e7204996b603e948380e18 Parents: 1842c55 Author: gatorsmile <gatorsm...@gmail.com> Authored: Thu Feb 11 08:40:27 2016 +0100 Committer: Herman van Hovell <hvanhov...@questtec.nl> Committed: Thu Feb 11 08:40:27 2016 +0100 ---------------------------------------------------------------------- .../spark/sql/catalyst/parser/SparkSqlParser.g | 28 +---------------- .../spark/sql/catalyst/CatalystQlSuite.scala | 33 ++++++++++++++++++-- 2 files changed, 32 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/e88bff12/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g index 9f2a5eb..24483cc 100644 --- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g +++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g @@ -2370,34 +2370,8 @@ setOpSelectStatement[CommonTree t, boolean topLevel] u=setOperator LPAREN b=simpleSelectStatement RPAREN | u=setOperator b=simpleSelectStatement) - -> {$setOpSelectStatement.tree != null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}? 
- ^(TOK_QUERY - ^(TOK_FROM - ^(TOK_SUBQUERY - ^($u {$setOpSelectStatement.tree} $b) - {adaptor.create(Identifier, generateUnionAlias())} - ) - ) - ^(TOK_INSERT - ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE)) - ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF)) - ) - ) - -> {$setOpSelectStatement.tree != null && $u.tree.getType()!=SparkSqlParser.TOK_UNIONDISTINCT}? + -> {$setOpSelectStatement.tree != null}? ^($u {$setOpSelectStatement.tree} $b) - -> {$setOpSelectStatement.tree == null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}? - ^(TOK_QUERY - ^(TOK_FROM - ^(TOK_SUBQUERY - ^($u {$t} $b) - {adaptor.create(Identifier, generateUnionAlias())} - ) - ) - ^(TOK_INSERT - ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE)) - ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF)) - ) - ) -> ^($u {$t} $b) )+ o=orderByClause? http://git-wip-us.apache.org/repos/asf/spark/blob/e88bff12/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala index 682b77d..8d7d6b5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala @@ -18,10 +18,10 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction} +import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.unsafe.types.CalendarInterval class CatalystQlSuite extends PlanTest { @@ -45,6 
+45,35 @@ class CatalystQlSuite extends PlanTest { comparePlans(parsed, expected) } + test("test Union Distinct operator") { + val parsed1 = parser.parsePlan("SELECT * FROM t0 UNION SELECT * FROM t1") + val parsed2 = parser.parsePlan("SELECT * FROM t0 UNION DISTINCT SELECT * FROM t1") + val expected = + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + Subquery("u_1", + Distinct( + Union( + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t0"), None)), + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t1"), None)))))) + comparePlans(parsed1, expected) + comparePlans(parsed2, expected) + } + + test("test Union All operator") { + val parsed = parser.parsePlan("SELECT * FROM t0 UNION ALL SELECT * FROM t1") + val expected = + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + Subquery("u_1", + Union( + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t0"), None)), + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t1"), None))))) + comparePlans(parsed, expected) + } + test("support hive interval literal") { def checkInterval(sql: String, result: CalendarInterval): Unit = { val parsed = parser.parsePlan(sql) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org