Github user aokolnychyi commented on a diff in the pull request: https://github.com/apache/spark/pull/22857#discussion_r229133550 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala --- @@ -736,3 +736,65 @@ object CombineConcats extends Rule[LogicalPlan] { flattenConcats(concat) } } + +/** + * A rule that replaces `Literal(null, _)` with `FalseLiteral` for further optimizations. + * + * For example, `Filter(Literal(null, _))` is equal to `Filter(FalseLiteral)`. + * + * Another example containing branches is `Filter(If(cond, FalseLiteral, Literal(null, _)))`; + * this can be optimized to `Filter(If(cond, FalseLiteral, FalseLiteral))`, and eventually + * `Filter(FalseLiteral)`. + * + * As a result, many unnecessary computations can be removed in the query optimization phase. + * + * Similarly, the same logic can be applied to conditions in [[Join]], predicates in [[If]], + * conditions in [[CaseWhen]]. + */ +object ReplaceNullWithFalse extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case f @ Filter(cond, _) => f.copy(condition = replaceNullWithFalse(cond)) + case j @ Join(_, _, _, Some(cond)) => j.copy(condition = Some(replaceNullWithFalse(cond))) + case p: LogicalPlan => p transformExpressions { + case i @ If(pred, _, _) => i.copy(predicate = replaceNullWithFalse(pred)) + case CaseWhen(branches, elseValue) => + val newBranches = branches.map { case (cond, value) => + replaceNullWithFalse(cond) -> value + } + CaseWhen(newBranches, elseValue) + } + } + + /** + * Recursively replaces `Literal(null, _)` with `FalseLiteral`. + * + * Note that `transformExpressionsDown` can not be used here as we must stop as soon as we hit + * an expression that is not [[CaseWhen]], [[If]], [[And]], [[Or]] or `Literal(null, _)`. + */ + private def replaceNullWithFalse(e: Expression): Expression = e match { + case cw: CaseWhen if getValues(cw).forall(isNullOrBoolean) => + val newBranches = cw.branches.map { case (cond, value) => + replaceNullWithFalse(cond) -> replaceNullWithFalse(value) + } + val newElseValue = cw.elseValue.map(replaceNullWithFalse) + CaseWhen(newBranches, newElseValue) + case If(pred, trueVal, falseVal) if Seq(trueVal, falseVal).forall(isNullOrBoolean) => + If(replaceNullWithFalse(pred), replaceNullWithFalse(trueVal), replaceNullWithFalse(falseVal)) + case And(left, right) => --- End diff -- Could you elaborate a bit more on `null && false`? I had in mind `AND(true, null)` and `OR(false, null)`, which are tricky. For example, if we use `AND(true, null)` in SELECT, we will get `null`. However, if we use it inside `Filter` or predicate of `If`, it will be semantically equivalent to `false` (e.g., `If$eval`). Therefore, the proposed rule has a limited scope. I explored the source code & comments in `And/Or` to come up with an edge case that wouldnât work. I could not find such a case. To me, it seems safe because the rule is applied only to expressions that evaluate to `false` if the underlying expression is `null` (i.e., conditions in `Filter`/`Join`, predicates in `If`, conditions in `CaseWhen`). Please, let me know if you have a particular case to test.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org