Andreas Franz created SPARK-40413:
-------------------------------------

             Summary: Column.isin produces non-boolean results
                 Key: SPARK-40413
                 URL: https://issues.apache.org/jira/browse/SPARK-40413
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 3.3.0
            Reporter: Andreas Franz


I observed an inconsistent behaviour using the Column.isin function. The 
[documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column]
 states that an "up-cast" takes place when different data types are involved. 
When working with _null_ values the results are confusing to me.

I prepared a small example demonstrating the issue
{code:java}
package example

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.functions._

object Test {


    def main(args: Array[String]): Unit = {

        val spark = SparkSession.builder()
            .appName("App")
            .master("local[*]")
            .config("spark.driver.host", "localhost")
            .config("spark.ui.enabled", "false")
            .getOrCreate()

        val schema = StructType(
            Array(
                StructField("name", StringType, nullable = true)
            )
        )

        val data = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
            Row(null)
        ).toList

        val list1 = Array("a", "d", "")
        val list2 = Array("a", "d", "", null)

        val dataFrame = 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema)

        dataFrame
            .withColumn("name_is_in_list_1", col("name").isin(list1: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_1|
            +----+-----------------+
            |a   |true             |
            |b   |false            |
            |c   |false            |
            |    |true             |
            |null|null             | // check value null is not contained in 
list1, why is null returned here? Expected result: false
            +----+-----------------+
         */

        dataFrame
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list2, why is null returned here? Expected result: false
            |c   |null             | // check value "c" is not contained in 
list2, why is null returned here? Expected result: false
            |    |true             |
            |null|null             | // check value null is in list2, why is 
null returned here? Expected result: true
            +----+-----------------+
         */


        val data2 = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
        ).toList

        val dataFrame2 = 
spark.createDataFrame(spark.sparkContext.parallelize(data2), schema)

        dataFrame2
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)
        
        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list2, why is null returned here? Expected result: false
            |c   |null             | // check value "c" is not contained in 
list2, why is null returned here? Expected result: false
            |    |true             |
            +----+-----------------+
         */
    }
}{code}
 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to