Re: [PR] [SPARK-47955][SQL] Improve `DeduplicateRelations` performance [spark]

via GitHub Fri, 26 Apr 2024 00:21:26 -0700


beliefer commented on code in PR #46183:
URL: https://github.com/apache/spark/pull/46183#discussion_r1580583432



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala:
##########
@@ -38,28 +38,29 @@ case class RelationWrapper(cls: Class[_], outputAttrIds: 
Seq[Long])
 object DeduplicateRelations extends Rule[LogicalPlan] {
   override def apply(plan: LogicalPlan): LogicalPlan = {
     val newPlan = renewDuplicatedRelations(mutable.HashSet.empty, plan)._1
-    if (newPlan.find(p => p.resolved && p.missingInput.nonEmpty).isDefined) {
-      // Wait for `ResolveMissingReferences` to resolve missing attributes 
first
-      return newPlan
-    }
+
+    def noMissingInput(p: LogicalPlan) = !p.exists(_.missingInput.nonEmpty)
+
     newPlan.resolveOperatorsUpWithPruning(
       _.containsAnyPattern(JOIN, LATERAL_JOIN, AS_OF_JOIN, INTERSECT, EXCEPT, 
UNION, COMMAND),
       ruleId) {
       case p: LogicalPlan if !p.childrenResolved => p
       // To resolve duplicate expression IDs for Join.
-      case j @ Join(left, right, _, _, _) if !j.duplicateResolved =>
+      case j @ Join(left, right, _, _, _) if !j.duplicateResolved && 
noMissingInput(right) =>
         j.copy(right = dedupRight(left, right))
       // Resolve duplicate output for LateralJoin.
-      case j @ LateralJoin(left, right, _, _) if right.resolved && 
!j.duplicateResolved =>
+      case j @ LateralJoin(left, right, _, _)
+          if right.resolved && !j.duplicateResolved && 
noMissingInput(right.plan) =>
         j.copy(right = right.withNewPlan(dedupRight(left, right.plan)))
       // Resolve duplicate output for AsOfJoin.
-      case j @ AsOfJoin(left, right, _, _, _, _, _) if !j.duplicateResolved =>
+      case j @ AsOfJoin(left, right, _, _, _, _, _)
+          if !j.duplicateResolved && noMissingInput(right) =>
         j.copy(right = dedupRight(left, right))
       // intersect/except will be rewritten to join at the beginning of 
optimizer. Here we need to
       // deduplicate the right side plan, so that we won't produce an invalid 
self-join later.
-      case i @ Intersect(left, right, _) if !i.duplicateResolved =>
+      case i @ Intersect(left, right, _) if !i.duplicateResolved && 
noMissingInput(right) =>
         i.copy(right = dedupRight(left, right))
-      case e @ Except(left, right, _) if !e.duplicateResolved =>
+      case e @ Except(left, right, _) if !e.duplicateResolved && 
noMissingInput(right) =>
         e.copy(right = dedupRight(left, right))
       // Only after we finish by-name resolution for Union
       case u: Union if !u.byName && !u.duplicateResolved =>

Review Comment:
   Got it. Thank you for your explanation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-47955][SQL] Improve `DeduplicateRelations` performance [spark]

Reply via email to