This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new 1b84e44fce7 [SPARK-40470][SQL] Handle GetArrayStructFields and GetMapValue in "arrays_zip" function 1b84e44fce7 is described below commit 1b84e44fce7da84382c4874fe3875d55c6647ddf Author: Ivan Sadikov <ivan.sadi...@databricks.com> AuthorDate: Fri Sep 16 22:05:03 2022 +0900 [SPARK-40470][SQL] Handle GetArrayStructFields and GetMapValue in "arrays_zip" function ### What changes were proposed in this pull request? This is a follow-up for https://github.com/apache/spark/pull/37833. The PR fixes column names in `arrays_zip` function for the cases when `GetArrayStructFields` and `GetMapValue` expressions are used (see unit tests for more details). Before the patch, the column names would be indexes or an AnalysisException would be thrown in the case of `GetArrayStructFields` example. ### Why are the changes needed? Fixes an inconsistency issue in Spark 3.2 and onwards where the fields would be labeled as indexes instead of column names. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? I added unit tests that reproduce the issue and confirmed that the patch fixes them. Closes #37911 from sadikovi/SPARK-40470. Authored-by: Ivan Sadikov <ivan.sadi...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 9b0f979141ba2c4124d96bc5da69ea5cac51df0d) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../expressions/collectionOperations.scala | 4 +- .../apache/spark/sql/DataFrameFunctionsSuite.scala | 45 ++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 6f66450844f..9919b31ca12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -191,7 +191,9 @@ case class ArraysZip(children: Seq[Expression], names: Seq[Expression]) case (u: UnresolvedAttribute, _) => Literal(u.nameParts.last) case (e: NamedExpression, _) if e.resolved => Literal(e.name) case (e: NamedExpression, _) => NamePlaceholder - case (e: GetStructField, _) => Literal(e.extractFieldName) + case (g: GetStructField, _) => Literal(g.extractFieldName) + case (g: GetArrayStructFields, _) => Literal(g.field.name) + case (g: GetMapValue, _) => Literal(g.key) case (_, idx) => Literal(idx.toString) }) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 820d760bd72..8e43de53c34 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -641,6 +641,51 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(fieldNames.toSeq === Seq("arr_1", "arr_2", "arr_3")) } + test("SPARK-40470: array_zip should return field names in GetArrayStructFields") { + val df = spark.read.json(Seq( + """ + { + "arr": [ + { + "obj": { + "nested": { + "field1": [1], + "field2": [2] + } + } + } + ] + } + """).toDS()) + + val res = df + .selectExpr("arrays_zip(arr.obj.nested.field1, arr.obj.nested.field2) as arr") + .select(col("arr.field1"), col("arr.field2")) + + val fieldNames = res.schema.fieldNames + assert(fieldNames.toSeq === Seq("field1", "field2")) + + checkAnswer(res, Row(Seq(Seq(1)), Seq(Seq(2))) :: Nil) + } + + test("SPARK-40470: arrays_zip should return field names in GetMapValue") { + val df = spark.sql(""" + select + map( + 'arr_1', array(1, 2), + 'arr_2', array(3, 4) + ) as map_obj + """) + + val res = df.selectExpr("arrays_zip(map_obj.arr_1, map_obj.arr_2) as arr") + + val fieldNames = res.schema.head.dataType.asInstanceOf[ArrayType] + .elementType.asInstanceOf[StructType].fieldNames + assert(fieldNames.toSeq === Seq("arr_1", "arr_2")) + + checkAnswer(res, Row(Seq(Row(1, 3), Row(2, 4)))) + } + def testSizeOfMap(sizeOfNull: Any): Unit = { val df = Seq( (Map[Int, Int](1 -> 1, 2 -> 2), "x"), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org