Repository: spark
Updated Branches:
  refs/heads/master 39c836e97 -> db7317ac3


[SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata

## What changes were proposed in this pull request?

`Alias` with metadata is not a no-op and we should not strip it in 
`RemoveAliasOnlyProject` rule.
This PR also did some improvement for this rule:

1. extend the semantic of `alias-only`. Now we allow the project list to be 
partially aliased.
2. add unit test for this rule.

## How was this patch tested?

new `RemoveAliasOnlyProjectSuite`

Author: Wenchen Fan <wenc...@databricks.com>

Closes #14106 from cloud-fan/bug.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db7317ac
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db7317ac
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db7317ac

Branch: refs/heads/master
Commit: db7317ac3c2fd2a11088d10060f168178dc99664
Parents: 39c836e
Author: Wenchen Fan <wenc...@databricks.com>
Authored: Thu Jul 14 15:48:22 2016 +0800
Committer: Cheng Lian <l...@databricks.com>
Committed: Thu Jul 14 15:48:22 2016 +0800

----------------------------------------------------------------------
 .../sql/catalyst/optimizer/Optimizer.scala      | 49 ++++++++-----
 .../optimizer/RemoveAliasOnlyProjectSuite.scala | 77 ++++++++++++++++++++
 2 files changed, 108 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/db7317ac/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 08fb019..c8e9d8e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -165,36 +165,49 @@ object PushProjectThroughSample extends Rule[LogicalPlan] 
{
  * but can also benefit other operators.
  */
 object RemoveAliasOnlyProject extends Rule[LogicalPlan] {
-  // Check if projectList in the Project node has the same attribute names and 
ordering
-  // as its child node.
+  /**
+   * Returns true if the project list is semantically same as child output, 
after strip alias on
+   * attribute.
+   */
   private def isAliasOnly(
       projectList: Seq[NamedExpression],
       childOutput: Seq[Attribute]): Boolean = {
-    if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != 
childOutput.length) {
+    if (projectList.length != childOutput.length) {
       false
     } else {
-      projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case 
(a, o) =>
-        a.child match {
-          case attr: Attribute if a.name == attr.name && 
attr.semanticEquals(o) => true
-          case _ => false
-        }
+      stripAliasOnAttribute(projectList).zip(childOutput).forall {
+        case (a: Attribute, o) if a semanticEquals o => true
+        case _ => false
       }
     }
   }
 
+  private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = {
+    projectList.map {
+      // Alias with metadata can not be stripped, or the metadata will be lost.
+      // If the alias name is different from attribute name, we can't strip it 
either, or we may
+      // accidentally change the output schema name of the root plan.
+      case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && 
name == attr.name =>
+        attr
+      case other => other
+    }
+  }
+
   def apply(plan: LogicalPlan): LogicalPlan = {
-    val aliasOnlyProject = plan.find {
-      case Project(pList, child) if isAliasOnly(pList, child.output) => true
-      case _ => false
+    val aliasOnlyProject = plan.collectFirst {
+      case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p
     }
 
-    aliasOnlyProject.map { case p: Project =>
-      val aliases = p.projectList.map(_.asInstanceOf[Alias])
-      val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child)))
-      plan.transformAllExpressions {
-        case a: Attribute if attrMap.contains(a) => attrMap(a)
-      }.transform {
-        case op: Project if op.eq(p) => op.child
+    aliasOnlyProject.map { case proj =>
+      val attributesToReplace = proj.output.zip(proj.child.output).filterNot {
+        case (a1, a2) => a1 semanticEquals a2
+      }
+      val attrMap = AttributeMap(attributesToReplace)
+      plan transform {
+        case plan: Project if plan eq proj => plan.child
+        case plan => plan transformExpressions {
+          case a: Attribute if attrMap.contains(a) => attrMap(a)
+        }
       }
     }.getOrElse(plan)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/db7317ac/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
new file mode 100644
index 0000000..7c26cb5
--- /dev/null
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.types.MetadataBuilder
+
+class RemoveAliasOnlyProjectSuite extends PlanTest with PredicateHelper {
+
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches = Batch("RemoveAliasOnlyProject", FixedPoint(50), 
RemoveAliasOnlyProject) :: Nil
+  }
+
+  test("all expressions in project list are aliased child output") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation.select('a as 'a, 'b as 'b).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, relation)
+  }
+
+  test("all expressions in project list are aliased child output but with 
different order") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation.select('b as 'b, 'a as 'a).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, query)
+  }
+
+  test("some expressions in project list are aliased child output") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation.select('a as 'a, 'b).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, relation)
+  }
+
+  test("some expressions in project list are aliased child output but with 
different order") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation.select('b as 'b, 'a).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, query)
+  }
+
+  test("some expressions in project list are not Alias or Attribute") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation.select('a as 'a, 'b + 1).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, query)
+  }
+
+  test("some expressions in project list are aliased child output but with 
metadata") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val metadata = new MetadataBuilder().putString("x", "y").build()
+    val aliasWithMeta = Alias('a, "a")(explicitMetadata = Some(metadata))
+    val query = relation.select(aliasWithMeta, 'b).analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, query)
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to