cloud-fan commented on a change in pull request #24735: [SPARK-27871][SQL] LambdaVariable should use per-query unique IDs instead of globally unique IDs URL: https://github.com/apache/spark/pull/24735#discussion_r292763275
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/objects.scala ########## @@ -228,3 +228,31 @@ object ObjectSerializerPruning extends Rule[LogicalPlan] { } } } + +/** + * Reassigns per-query unique IDs to `LambdaVariable`s, whose original IDs are globally unique. This + * can help Spark hit the codegen cache more often and improve performance. + */ +object ReassignLambdaVariableID extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + if (!SQLConf.get.getConf(SQLConf.OPTIMIZER_REASSIGN_LAMBDA_VARIABLE_ID)) return plan + + // The original LambdaVariable IDs are all positive. To avoid conflicts, the new IDs are all + // negative and start with -1. + var newId = -1L + val oldIdToNewId = scala.collection.mutable.Map.empty[Long, Long] + + plan.transformAllExpressions { Review comment: fixed. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org