cloud-fan commented on code in PR #46263: URL: https://github.com/apache/spark/pull/46263#discussion_r1584077334
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala: ########## @@ -129,17 +136,36 @@ object InjectRuntimeFilter extends Rule[LogicalPlan] with PredicateHelper with J if (left.output.exists(_.semanticEquals(targetKey))) { extract(left, AttributeSet.empty, hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = left, targetKey = targetKey).orElse { - // We can also extract from the right side if the join keys are transitive. - lkeys.zip(rkeys).find(_._1.semanticEquals(targetKey)).map(_._2) - .flatMap { newTargetKey => - extract(right, AttributeSet.empty, - hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = right, - targetKey = newTargetKey) - } + // For the exact join key match, like the left table here, it's always OK to generate + // the runtime filter using this left table, no matter what the join type is. + // This is because left table always produce a superset of output of the join output + // regarding the left keys. + // For transitive join key match, it's different. The right table here does + // not always generate a superset output regarding left keys. + // Let's look at an example + // left table: 1, 2, 3 + // right table, 3, 4 + // left outer join output: (1, null), (2, null), (3, 3) + // left keys: 1, 2, 3 + // So we can't use right table to generate runtime filter. + if (canExtractRight(joinType)) { + lkeys.zip(rkeys).find(_._1.semanticEquals(targetKey)).map(_._2) + .flatMap { newTargetKey => + extract(right, AttributeSet.empty, + hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = right, + targetKey = newTargetKey) + } + } else { + None + } } } else if (right.output.exists(_.semanticEquals(targetKey))) { - extract(right, AttributeSet.empty, hasHitFilter = false, hasHitSelectiveFilter = false, - currentPlan = right, targetKey = targetKey).orElse { + if (canExtractRight(joinType)) { Review Comment: I think the code is wrong here. We only need this extra check for transitive join keys. In this branch, it's the left table. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org