Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/16757#discussion_r99382442 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala --- @@ -154,56 +155,108 @@ class SimpleTestOptimizer extends Optimizer( new SimpleCatalystConf(caseSensitiveAnalysis = true)) /** - * Removes the Project only conducting Alias of its child node. - * It is created mainly for removing extra Project added in EliminateSerialization rule, - * but can also benefit other operators. + * Remove redundant aliases from a query plan. A redundant alias is an alias that does not change + * the name or metadata of a column, and does not deduplicate it. */ -object RemoveAliasOnlyProject extends Rule[LogicalPlan] { +object RemoveRedundantAliases extends Rule[LogicalPlan] { + /** - * Returns true if the project list is semantically same as child output, after strip alias on - * attribute. + * Replace the attributes in an expression using the given mapping. */ - private def isAliasOnly( - projectList: Seq[NamedExpression], - childOutput: Seq[Attribute]): Boolean = { - if (projectList.length != childOutput.length) { - false - } else { - stripAliasOnAttribute(projectList).zip(childOutput).forall { - case (a: Attribute, o) if a semanticEquals o => true - case _ => false - } + private def createAttributeMapping(current: LogicalPlan, next: LogicalPlan) + : Seq[(Attribute, Attribute)] = { + current.output.zip(next.output).filterNot { + case (a1, a2) => a1.semanticEquals(a2) } } - private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = { - projectList.map { - // Alias with metadata can not be stripped, or the metadata will be lost. - // If the alias name is different from attribute name, we can't strip it either, or we may - // accidentally change the output schema name of the root plan. 
- case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name => - attr - case other => other - } + /** + * Remove the top-level alias from an expression when it is redundant. + */ + private def removeRedundantAlias(e: Expression, blacklist: AttributeSet): Expression = e match { + // Alias with metadata can not be stripped, or the metadata will be lost. + // If the alias name is different from attribute name, we can't strip it either, or we + // may accidentally change the output schema name of the root plan. + case a @ Alias(attr: Attribute, name) + if a.metadata == Metadata.empty && name == attr.name && !blacklist.contains(attr) => + attr + case a => a } - def apply(plan: LogicalPlan): LogicalPlan = { - val aliasOnlyProject = plan.collectFirst { - case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p + /** + * Get an appropriate alias cleaning method for the given node. + * + * We currently clean Project, Aggregate & Window nodes. --- End diff -- Yeah that is an improvement. I added all LogicalPlan nodes that are producing new attributes using named expressions. I will inline this method.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it to be, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org