[ 
https://issues.apache.org/jira/browse/SPARK-36686?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Andrew updated SPARK-36686:
---------------------------
    Description: 
SimplifyConditionalsInPredicate rule is not null-safe and leads to incorrect 
results

 

Reproducible:

import org.apache.spark.sql.types.{StructField, BooleanType, StructType}
 import org.apache.spark.sql.Row

val schema = List(
 StructField("b", BooleanType, true)
 )
 val data = Seq(
 Row(true),
 Row(false),
 Row(null)
 )
 val df = spark.createDataFrame(
 spark.sparkContext.parallelize(data),
 StructType(schema)
 )

// cartesian product of true / false / null
 val df2 = df.select(col("b") as "cond").crossJoin(df.select(col("b") as 
"falseVal"))
 df2.createOrReplaceTempView("df2")

expected:

spark.sql("SELECT (IF(cond, FALSE, falseVal) <=> TRUE) FROM df2").show()

+--------------------------------------+
|((IF(cond, false, falseVal)) <=> true)|

+--------------------------------------+
|false|
|false|
|false|
|true|
|false|
|false|
|true|
|false|
|false|

+--------------------------------------+

actual (rewritten by SimplifyConditionalsInPredicate):

spark.sql("SELECT (AND(NOT(cond), falseVal) <=> TRUE) FROM df2").show()

+------------------------------------+
|(((NOT cond) AND falseVal) <=> true)|

+------------------------------------+
|false|
|false|
|false|
|true|
|false|
|false|
|false|
|false|
|false|

+------------------------------------+

  was:
SimplifyConditionalsInPredicate rule is not null-safe and leads to incorrect 
results

 

Reproducible:

import org.apache.spark.sql.types.{StructField, BooleanType, StructType}
 import org.apache.spark.sql.Row

val schema = List(
 StructField("b", BooleanType, true)
 )
 val data = Seq(
 Row(true),
 Row(false),
 Row(null)
 )
 val df = spark.createDataFrame(
 spark.sparkContext.parallelize(data),
 StructType(schema)
 )

// cartesian product of true / false / null
 val df2 = df.select(col("b") as "cond").crossJoin(df.select(col("b") as 
"falseVal"))
 df2.createOrReplaceTempView("df2")

expected:

spark.sql("SELECT (IF(cond, FALSE, falseVal) <=> TRUE) FROM df2").show()

+--------------------------------------+
|((IF(cond, false, falseVal)) <=> true)|

+--------------------------------------+
|false|
|false|
|false|
|true|
|false|
|false|
|true|
|false|
|false|

+--------------------------------------+

actual (rewrite by SimplifyConditionalsInPredicate):

spark.sql("SELECT (AND(NOT(cond), falseVal) <=> TRUE) FROM df2").show()

+------------------------------------+
|(((NOT cond) AND falseVal) <=> true)|

+------------------------------------+
|false|
|false|
|false|
|true|
|false|
|false|
|false|
|false|
|false|

+------------------------------------+


> Fix SimplifyConditionalsInPredicate to be null-safe
> ---------------------------------------------------
>
>                 Key: SPARK-36686
>                 URL: https://issues.apache.org/jira/browse/SPARK-36686
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.1.2
>            Reporter: Andrew
>            Priority: Major
>
> SimplifyConditionalsInPredicate rule is not null-safe and leads to incorrect 
> results
>  
> Reproducible:
> import org.apache.spark.sql.types.{StructField, BooleanType, StructType}
>  import org.apache.spark.sql.Row
> val schema = List(
>  StructField("b", BooleanType, true)
>  )
>  val data = Seq(
>  Row(true),
>  Row(false),
>  Row(null)
>  )
>  val df = spark.createDataFrame(
>  spark.sparkContext.parallelize(data),
>  StructType(schema)
>  )
> // cartesian product of true / false / null
>  val df2 = df.select(col("b") as "cond").crossJoin(df.select(col("b") as 
> "falseVal"))
>  df2.createOrReplaceTempView("df2")
> expected:
> spark.sql("SELECT (IF(cond, FALSE, falseVal) <=> TRUE) FROM df2").show()
> +--------------------------------------+
> |((IF(cond, false, falseVal)) <=> true)|
> +--------------------------------------+
> |false|
> |false|
> |false|
> |true|
> |false|
> |false|
> |true|
> |false|
> |false|
> +--------------------------------------+
> actual (rewritten by SimplifyConditionalsInPredicate):
> spark.sql("SELECT (AND(NOT(cond), falseVal) <=> TRUE) FROM df2").show()
> +------------------------------------+
> |(((NOT cond) AND falseVal) <=> true)|
> +------------------------------------+
> |false|
> |false|
> |false|
> |true|
> |false|
> |false|
> |false|
> |false|
> |false|
> +------------------------------------+



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to