This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new a71aed977f5 [SPARK-42401][SQL] Set `containsNull` correctly in the data type for array_insert/array_append a71aed977f5 is described below commit a71aed977f5006ad271f947a6ae3cdd38349ed8e Author: Bruce Robbins <bersprock...@gmail.com> AuthorDate: Mon Feb 13 15:49:46 2023 +0900 [SPARK-42401][SQL] Set `containsNull` correctly in the data type for array_insert/array_append ### What changes were proposed in this pull request? In the `DataType` instance returned by `ArrayInsert#dataType` and `ArrayAppend#dataType`, set `containsNull` to true if either - the input array has `containsNull` set to true - the expression to be inserted/appended is nullable. ### Why are the changes needed? The following two queries return the wrong answer: ``` spark-sql> select array_insert(array(1, 2, 3, 4), 5, cast(null as int)); [1,2,3,4,0] <== should be [1,2,3,4,null] Time taken: 3.879 seconds, Fetched 1 row(s) spark-sql> select array_append(array(1, 2, 3, 4), cast(null as int)); [1,2,3,4,0] <== should be [1,2,3,4,null] Time taken: 0.068 seconds, Fetched 1 row(s) spark-sql> ``` The following two queries throw a `NullPointerException`: ``` spark-sql> select array_insert(array('1', '2', '3', '4'), 5, cast(null as string)); 23/02/10 11:24:59 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2) java.lang.NullPointerException at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) ... spark-sql> select array_append(array('1', '2', '3', '4'), cast(null as string)); 23/02/10 11:25:10 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3) java.lang.NullPointerException at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) ... spark-sql> ``` The bug arises because both `ArrayInsert` and `ArrayAppend` use the first child's data type as the function's data type. That is, it uses the first child's `containsNull` setting, regardless of whether the insert/append operation might produce an array containing a null value. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit tests. Closes #39970 from bersprockets/array_insert_anomaly. Authored-by: Bruce Robbins <bersprock...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 718b6b7ed8277d5f6577367ab0d49f60f9777df7) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../sql/catalyst/expressions/collectionOperations.scala | 4 ++-- .../catalyst/expressions/CollectionExpressionsSuite.scala | 14 ++++++++++++++ .../org/apache/spark/sql/DataFrameFunctionsSuite.scala | 14 ++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 92a3127d438..53d8ff160c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -4840,7 +4840,7 @@ case class ArrayInsert(srcArrayExpr: Expression, posExpr: Expression, itemExpr: override def third: Expression = itemExpr override def prettyName: String = "array_insert" - override def dataType: DataType = first.dataType + override def dataType: DataType = if (third.nullable) first.dataType.asNullable else first.dataType override def nullable: Boolean = first.nullable | second.nullable @transient private lazy val elementType: DataType = @@ -5024,7 +5024,7 @@ case class ArrayAppend(left: Expression, right: Expression) * Returns the [[DataType]] of the result of evaluating this expression. It is invalid to query * the dataType of an unresolved expression (i.e., when `resolved` == false). */ - override def dataType: DataType = left.dataType + override def dataType: DataType = if (right.nullable) left.dataType.asNullable else left.dataType protected def withNewChildrenInternal(newLeft: Expression, newRight: Expression): ArrayAppend = copy(left = newLeft, right = newRight) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 5917d84df1e..64b9c18605d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -2752,4 +2752,18 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper ) } + + test("SPARK-42401: Array insert of null value") { + val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false)) + checkEvaluation(ArrayInsert( + a, Literal(2), Literal.create(null, StringType)), Seq("b", null, "a", "c") + ) + } + + test("SPARK-42401: Array append of null value") { + val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false)) + checkEvaluation(ArrayAppend( + a, Literal.create(null, StringType)), Seq("b", "a", "c", null) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 6ed8299976c..94f813a2c6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -5431,6 +5431,20 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(Seq(1, 2, 3, null, null))) ) } + + test("SPARK-42401: array_insert - insert null") { + checkAnswer( + sql("select array_insert(array('b', 'a', 'c'), 2, cast(null as string))"), + Seq(Row(Seq("b", null, "a", "c"))) + ) + } + + test("SPARK-42401: array_append - append null") { + checkAnswer( + sql("select array_append(array('b', 'a', 'c'), cast(null as string))"), + Seq(Row(Seq("b", "a", "c", null))) + ) + } } object DataFrameFunctionsSuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org