[ https://issues.apache.org/jira/browse/SPARK-40413?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hyukjin Kwon resolved SPARK-40413. ---------------------------------- Resolution: Invalid > Column.isin produces non-boolean results > ---------------------------------------- > > Key: SPARK-40413 > URL: https://issues.apache.org/jira/browse/SPARK-40413 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.3.0 > Reporter: Andreas Franz > Priority: Major > > I observed an inconsistent behaviour using the Column.isin function. The > [documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column] > states that an "up-cast" takes places when different data types are > involved. When working with _null_ values the results are confusing to me. > I prepared a small example demonstrating the issue > {code:java} > package example > import org.apache.spark.sql.{Row, SparkSession} > import org.apache.spark.sql.types.{StringType, StructField, StructType} > import org.apache.spark.sql.functions._ > object Test { > def main(args: Array[String]): Unit = { > val spark = SparkSession.builder() > .appName("App") > .master("local[*]") > .config("spark.driver.host", "localhost") > .config("spark.ui.enabled", "false") > .getOrCreate() > val schema = StructType( > Array( > StructField("name", StringType, nullable = true) > ) > ) > val data = Seq( > Row("a"), > Row("b"), > Row("c"), > Row(""), > Row(null) > ).toList > val list1 = Array("a", "d", "") > val list2 = Array("a", "d", "", null) > val dataFrame = > spark.createDataFrame(spark.sparkContext.parallelize(data), schema) > dataFrame > .withColumn("name_is_in_list_1", col("name").isin(list1: _*)) > .show(10, truncate = false) > /* > +----+-----------------+ > |name|name_is_in_list_1| > +----+-----------------+ > |a |true | > |b |false | > |c |false | > | |true | > |null|null | // check value null is not contained in > list1, why is null returned here? Expected result: false > +----+-----------------+ > */ > dataFrame > .withColumn("name_is_in_list_2", col("name").isin(list2: _*)) > .show(10, truncate = false) > /* > +----+-----------------+ > |name|name_is_in_list_2| > +----+-----------------+ > |a |true | > |b |null | // check value "b" is not contained in > list1, why is null returned here? Expected result: false > |c |null | // check value "c" is not contained in > list1, why is null returned here? Expected result: false > | |true | > |null|null | // check value null is in list1, why is > null returned here? Expected result: true > +----+-----------------+ > */ > val data2 = Seq( > Row("a"), > Row("b"), > Row("c"), > Row(""), > ).toList > val dataFrame2 = > spark.createDataFrame(spark.sparkContext.parallelize(data2), schema) > dataFrame2 > .withColumn("name_is_in_list_2", col("name").isin(list2: _*)) > .show(10, truncate = false) > > /* > +----+-----------------+ > |name|name_is_in_list_2| > +----+-----------------+ > |a |true | > |b |null | // check value "b" is not contained in > list2, why is null returned here? Expected result: false > |c |null | // check value "b" is not contained in > list2, why is null returned here? Expected result: false > | |true | > +----+-----------------+ > */ > } > }{code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org