peter-toth commented on code in PR #37525: URL: https://github.com/apache/spark/pull/37525#discussion_r1081448247
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/AliasAwareOutputExpression.scala: ########## @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeSet, Empty2Null, Expression, NamedExpression, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} +import org.apache.spark.sql.catalyst.trees.MultiTransformHelper +import org.apache.spark.sql.internal.SQLConf + +/** + * A trait that provides functionality to handle aliases in the `outputExpressions`. + */ +trait AliasAwareOutputExpression extends SQLConfHelper with MultiTransformHelper { + private val aliasCandidateLimit = conf.getConf(SQLConf.EXPRESSION_PROJECTION_CANDIDATE_LIMIT) + protected def outputExpressions: Seq[NamedExpression] + /** + * This method can be used to strip expression which does not affect the result, for example: + * strip the expression which is ordering agnostic for output ordering. 
+ */ + protected def strip(expr: Expression): Expression = expr + + // Split the alias map into 2 maps, the first contains `Expression` -> `Attribute` mappings where + // any children of the `Expression` contains any other mapping. This is because during + // `normalizeExpression()` we will need to handle those maps separately and don't stop generating + // alternatives at the `Expression` but we also need to traverse down to its children. + private lazy val (exprAliasMap, attrAliasMap) = { + val aliases = mutable.Map[Expression, mutable.ListBuffer[Attribute]]() + // Add aliases to the map. If multiple aliases are defined for a source attribute then add all. + outputExpressions.foreach { + case a @ Alias(child, _) => + // This prepend is needed to make the first element of the `ListBuffer` point to the last + // occurrence of an aliased child. This is to keep the previous behavior and give precedence + // to the last Alias during `normalizeExpression()` to avoid any kind of regression. + a.toAttribute +=: + aliases.getOrElseUpdate(strip(child.canonicalized), mutable.ListBuffer.empty) + case _ => + } + // Append identity mapping of an attribute to the map if both the attribute and its aliased + // version can be found in `outputExpressions`. + outputExpressions.foreach { + case a: Attribute if aliases.contains(a.canonicalized) => aliases(a.canonicalized) += a + case _ => + } + + aliases.partition { case (expr, _) => expr.children.exists(_.exists(aliases.contains)) } Review Comment: Fixed in https://github.com/apache/spark/pull/37525/commits/a7955b58cb2cd0fe9325ca241453aca106f55af4 — we now have 1 map. ########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/AliasAwareOutputExpression.scala: ########## @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeSet, Empty2Null, Expression, NamedExpression, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} +import org.apache.spark.sql.catalyst.trees.MultiTransformHelper +import org.apache.spark.sql.internal.SQLConf + +/** + * A trait that provides functionality to handle aliases in the `outputExpressions`. + */ +trait AliasAwareOutputExpression extends SQLConfHelper with MultiTransformHelper { + private val aliasCandidateLimit = conf.getConf(SQLConf.EXPRESSION_PROJECTION_CANDIDATE_LIMIT) + protected def outputExpressions: Seq[NamedExpression] + /** + * This method can be used to strip expression which does not affect the result, for example: + * strip the expression which is ordering agnostic for output ordering. + */ + protected def strip(expr: Expression): Expression = expr + + // Split the alias map into 2 maps, the first contains `Expression` -> `Attribute` mappings where + // any children of the `Expression` contains any other mapping. 
This is because during + // `normalizeExpression()` we will need to handle those maps separately and don't stop generating + // alternatives at the `Expression` but we also need to traverse down to its children. + private lazy val (exprAliasMap, attrAliasMap) = { + val aliases = mutable.Map[Expression, mutable.ListBuffer[Attribute]]() + // Add aliases to the map. If multiple aliases are defined for a source attribute then add all. + outputExpressions.foreach { + case a @ Alias(child, _) => + // This prepend is needed to make the first element of the `ListBuffer` point to the last + // occurrence of an aliased child. This is to keep the previous behavior and give precedence + // to the last Alias during `normalizeExpression()` to avoid any kind of regression. + a.toAttribute +=: + aliases.getOrElseUpdate(strip(child.canonicalized), mutable.ListBuffer.empty) + case _ => + } + // Append identity mapping of an attribute to the map if both the attribute and its aliased + // version can be found in `outputExpressions`. + outputExpressions.foreach { + case a: Attribute if aliases.contains(a.canonicalized) => aliases(a.canonicalized) += a + case _ => + } + + aliases.partition { case (expr, _) => expr.children.exists(_.exists(aliases.contains)) } + } + + protected def hasAlias: Boolean = attrAliasMap.nonEmpty + + /** + * Return a set of Expression which normalize the original expression to the aliased. + */ + protected def normalizeExpression(expr: Expression): Seq[Expression] = { Review Comment: done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org