cloud-fan commented on code in PR #37165: URL: https://github.com/apache/spark/pull/37165#discussion_r919620796
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala:
##########
@@ -1011,24 +1011,92 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper {
       .forall {
         case (reference, count) =>
           val producer = producerMap.getOrElse(reference, reference)
-          producer.deterministic && (count == 1 || alwaysInline || {
-            val relatedConsumers = consumers.filter(_.references.contains(reference))
-            // It's still exactly-only if there is only one reference in non-extract expressions,
-            // as we won't duplicate the expensive CreateStruct-like expressions.
-            val extractOnly = relatedConsumers.map(refCountInNonExtract(_, reference)).sum <= 1
-            shouldInline(producer, extractOnly)
-          })
+          val relatedConsumers = consumers.filter(_.references.contains(reference))
+
+          def cheapToInlineProducer: Boolean = trimAliases(producer) match {
+            // These collection creation functions are not cheap as a producer, but we have
+            // optimizer rules that can optimize them out if they are only consumed by
+            // ExtractValue (See SimplifyExtractValueOps), so we need to allow to inline them to
+            // avoid perf regression. As an example:
+            //   Project(s.a, s.b, Project(create_struct(a, b, c) as s, child))
+            // We should collapse these two projects and eventually get Project(a, b, child)
+            case e @ (_: CreateNamedStruct | _: UpdateFields | _: CreateMap | _: CreateArray) =>
+              // We can inline the collection creation producer if at most one of its access
+              // is non-cheap. Cheap access here means the access can be optimized by
+              // `SimplifyExtractValueOps` and become a cheap expression. For example,
+              // `create_struct(a, b, c).a` is a cheap access as it can be optimized to `a`.
+              // For a query:
+              //   Project(s.a, s, Project(create_struct(a, b, c) as s, child))
+              // We should collapse these two projects and eventually get

Review Comment:
   The above one is more about an overview. This one is for the algorithm details.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org