This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 7741fe79e20 [SPARK-44550][SQL] Enable correctness fixes for `null IN 
(empty list)` under ANSI
7741fe79e20 is described below

commit 7741fe79e201eea42ceefedd10013680736fbbea
Author: Jack Chen <jack.c...@databricks.com>
AuthorDate: Tue Sep 26 09:40:38 2023 +0800

    [SPARK-44550][SQL] Enable correctness fixes for `null IN (empty list)` 
under ANSI
    
    ### What changes were proposed in this pull request?
    Enables the correctness fixes for `null IN (empty list)` expressions by 
default when ANSI is enabled. Under non-ANSI the old behavior remains the 
default for now. After soaking for some time under ANSI, we should switch the 
new behavior to default in both cases.
    
    Prior to this, `null IN (empty list)` incorrectly evaluated to null, when 
it should evaluate to false. (The reason it should be false is that `a IN 
(b1, b2)` is defined as `a = b1 OR a = b2`, so an empty IN list reduces to an 
empty OR, which is false. This is specified by ANSI SQL.)
    
    Many places in Spark execution (In, InSet, InSubquery) and optimization 
(OptimizeIn, NullPropagation) implemented this wrong behavior. This is a 
longstanding correctness issue which has existed since null support for IN 
expressions was first added to Spark.
    
    See previous PRs where the fixes were implemented: 
https://github.com/apache/spark/pull/42007 and 
https://github.com/apache/spark/pull/42163.
    
    See [this 
doc](https://docs.google.com/document/d/1k8AY8oyT-GI04SnP7eXttPDnDj-Ek-c3luF2zL6DPNU/edit)
 for more information.
    
    ### Why are the changes needed?
    Fix wrong SQL semantics
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, fix wrong SQL semantics
    
    ### How was this patch tested?
    Unit tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #43068 from jchen5/null-in-empty-enable.
    
    Authored-by: Jack Chen <jack.c...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/expressions/predicates.scala   |  4 ++--
 .../spark/sql/catalyst/optimizer/expressions.scala    | 10 +++++-----
 .../scala/org/apache/spark/sql/internal/SQLConf.scala |  8 +++++++-
 .../scala/org/apache/spark/sql/EmptyInSuite.scala     | 19 +++++++++++++++++++
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 31b872e04ce..419d11b13a2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -469,7 +469,7 @@ case class In(value: Expression, list: Seq[Expression]) 
extends Predicate {
 
   final override val nodePatterns: Seq[TreePattern] = Seq(IN)
   private val legacyNullInEmptyBehavior =
-    SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
+    SQLConf.get.legacyNullInEmptyBehavior
 
   override lazy val canonicalized: Expression = {
     val basic = withNewChildren(children.map(_.canonicalized)).asInstanceOf[In]
@@ -626,7 +626,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends 
UnaryExpression with
 
   final override val nodePatterns: Seq[TreePattern] = Seq(INSET)
   private val legacyNullInEmptyBehavior =
-    SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
+    SQLConf.get.legacyNullInEmptyBehavior
 
   override def eval(input: InternalRow): Any = {
     if (hset.isEmpty && !legacyNullInEmptyBehavior) {
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 8a7f54093d5..90773a1eb86 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -283,7 +283,7 @@ object OptimizeIn extends Rule[LogicalPlan] {
       case In(v, list) if list.isEmpty =>
         // IN (empty list) is always false under current behavior.
         // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
-        if (!SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)) {
+        if (!SQLConf.get.legacyNullInEmptyBehavior) {
           FalseLiteral
         } else {
           If(IsNotNull(v), FalseLiteral, Literal(null, BooleanType))
@@ -845,20 +845,20 @@ object NullPropagation extends Rule[LogicalPlan] {
 
       // If the list is empty, transform the In expression to false literal.
       case In(_, list)
-        if list.isEmpty && 
!SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) =>
+        if list.isEmpty && !SQLConf.get.legacyNullInEmptyBehavior =>
         Literal.create(false, BooleanType)
       // If the value expression is NULL (and the list is non-empty), then 
transform the
       // In expression to null literal.
       // If the legacy flag is set, then it becomes null even if the list is 
empty (which is
       // incorrect legacy behavior)
       case In(Literal(null, _), list)
-        if list.nonEmpty || 
SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
+        if list.nonEmpty || SQLConf.get.legacyNullInEmptyBehavior
       => Literal.create(null, BooleanType)
       case InSubquery(Seq(Literal(null, _)), _)
-        if SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) =>
+        if SQLConf.get.legacyNullInEmptyBehavior =>
         Literal.create(null, BooleanType)
       case InSubquery(Seq(Literal(null, _)), ListQuery(sub, _, _, _, 
conditions, _))
-        if !SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
+        if !SQLConf.get.legacyNullInEmptyBehavior
         && conditions.isEmpty =>
         If(Exists(sub), Literal(null, BooleanType), FalseLiteral)
 
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index cce85da29b6..43eb0756d8d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4361,6 +4361,8 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  // Default is false (new, correct behavior) when ANSI is on, true (legacy, 
incorrect behavior)
+  // when ANSI is off. See legacyNullInEmptyBehavior.
   val LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR =
     buildConf("spark.sql.legacy.nullInEmptyListBehavior")
       .internal()
@@ -4370,7 +4372,7 @@ object SQLConf {
         "incorrectly evaluates to null in the legacy behavior.")
       .version("3.5.0")
       .booleanConf
-      .createWithDefault(true)
+      .createOptional
 
   val ERROR_MESSAGE_FORMAT = buildConf("spark.sql.error.messageFormat")
     .doc("When PRETTY, the error message consists of textual representation of 
error class, " +
@@ -5185,6 +5187,10 @@ class SQLConf extends Serializable with Logging with 
SqlApiConf {
     getConf(SQLConf.LEGACY_SIZE_OF_NULL) && !getConf(ANSI_ENABLED)
   }
 
+  def legacyNullInEmptyBehavior: Boolean = {
+    getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR).getOrElse(!ansiEnabled)
+  }
+
   def isReplEagerEvalEnabled: Boolean = 
getConf(SQLConf.REPL_EAGER_EVAL_ENABLED)
 
   def replEagerEvalMaxNumRows: Int = 
getConf(SQLConf.REPL_EAGER_EVAL_MAX_NUM_ROWS)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
index 1534aba28c4..8e455eb4419 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
@@ -59,4 +59,23 @@ with SharedSparkSession {
       }
     }
   }
+
+  test("IN empty list behavior conf defaults") {
+    // Currently the fixed behavior is enabled when ANSI is on, and the legacy 
behavior when
+    // ANSI is off.
+    Seq(true, false).foreach { ansiEnabled =>
+      withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {
+        val legacyNullInBehavior = !ansiEnabled
+        assert(SQLConf.get.legacyNullInEmptyBehavior == legacyNullInBehavior)
+
+        val emptylist = Seq.empty[Literal]
+        val df = t.select(col("a"), col("a").isin(emptylist: _*))
+        val expectedResultForNullInEmpty =
+          if (legacyNullInBehavior) null else false
+        checkAnswer(
+          df,
+          Row(1, false) :: Row(null, expectedResultForNullInEmpty) :: Nil)
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to