Andreas Franz created SPARK-40413: ------------------------------------- Summary: Column.isin produces non-boolean results Key: SPARK-40413 URL: https://issues.apache.org/jira/browse/SPARK-40413 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 3.3.0 Reporter: Andreas Franz
I observed an inconsistent behaviour using the Column.isin function. The [documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column] states that an "up-cast" takes places when different data types are involved. When working with _null_ values the results are confusing to me. I prepared a small example demonstrating the issue {code:java} package example import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.functions._ object Test { def main(args: Array[String]): Unit = { val spark = SparkSession.builder() .appName("App") .master("local[*]") .config("spark.driver.host", "localhost") .config("spark.ui.enabled", "false") .getOrCreate() val schema = StructType( Array( StructField("name", StringType, nullable = true) ) ) val data = Seq( Row("a"), Row("b"), Row("c"), Row(""), Row(null) ).toList val list1 = Array("a", "d", "") val list2 = Array("a", "d", "", null) val dataFrame = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) dataFrame .withColumn("name_is_in_list_1", col("name").isin(list1: _*)) .show(10, truncate = false) /* +----+-----------------+ |name|name_is_in_list_1| +----+-----------------+ |a |true | |b |false | |c |false | | |true | |null|null | // check value null is not contained in list1, why is null returned here? Expected result: false +----+-----------------+ */ dataFrame .withColumn("name_is_in_list_2", col("name").isin(list2: _*)) .show(10, truncate = false) /* +----+-----------------+ |name|name_is_in_list_2| +----+-----------------+ |a |true | |b |null | // check value "b" is not contained in list1, why is null returned here? Expected result: false |c |null | // check value "c" is not contained in list1, why is null returned here? Expected result: false | |true | |null|null | // check value null is in list1, why is null returned here? Expected result: true +----+-----------------+ */ val data2 = Seq( Row("a"), Row("b"), Row("c"), Row(""), ).toList val dataFrame2 = spark.createDataFrame(spark.sparkContext.parallelize(data2), schema) dataFrame2 .withColumn("name_is_in_list_2", col("name").isin(list2: _*)) .show(10, truncate = false) /* +----+-----------------+ |name|name_is_in_list_2| +----+-----------------+ |a |true | |b |null | // check value "b" is not contained in list1, why is null returned here? Expected result: false |c |null | // check value "b" is not contained in list1, why is null returned here? Expected result: false | |true | +----+-----------------+ */ } }{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org