This is an automated email from the ASF dual-hosted git repository. viirya pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 71991f7 [SPARK-38285][SQL] Avoid generator pruning for invalid extractor 71991f7 is described below commit 71991f75ff441e80a52cb71f66f46bfebdb05671 Author: Liang-Chi Hsieh <vii...@gmail.com> AuthorDate: Mon Mar 7 12:04:24 2022 -0800 [SPARK-38285][SQL] Avoid generator pruning for invalid extractor ### What changes were proposed in this pull request? This fixes a bug in generator nested column pruning. The bug happens when the extractor pattern is like `GetArrayStructFields(GetStructField(...), ...)` on the generator output. Once the input to the generator is an array, after replacing with the extractor based on pruning logic, it becomes an extractor of `GetArrayStructFields(GetArrayStructFields(...), ...)` which is not valid. ### Why are the changes needed? To fix a bug in generator nested column pruning. ### Does this PR introduce _any_ user-facing change? Yes, fixing a user-facing bug. ### How was this patch tested? Added unit test. Closes #35749 from viirya/SPARK-38285. 
Authored-by: Liang-Chi Hsieh <vii...@gmail.com> Signed-off-by: Liang-Chi Hsieh <vii...@gmail.com> --- .../catalyst/optimizer/NestedColumnAliasing.scala | 11 +++++++++++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index c8c67f5..a2ee950 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -372,6 +372,17 @@ object GeneratorNestedColumnAliasing { e.withNewChildren(Seq(extractor)) } + // If after replacing generator expression with nested extractor, there + // is an invalid extractor pattern like + // `GetArrayStructFields(GetArrayStructFields(...), ...)`, we cannot do + // pruning but fall back to the original query plan. + val invalidExtractor = rewrittenG.generator.children.head.collect { + case GetArrayStructFields(_: GetArrayStructFields, _, _, _, _) => true + } + if (invalidExtractor.nonEmpty) { + return Some(pushedThrough) + } + // As we change the child of the generator, its output data type must be updated. 
val updatedGeneratorOutput = rewrittenG.generatorOutput .zip(rewrittenG.generator.elementSchema.toAttributes) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c7d05df..3eb9764 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -3107,6 +3107,26 @@ class DataFrameSuite extends QueryTest assert(res.collect.length == 2) } + + test("SPARK-38285: Fix ClassCastException: GenericArrayData cannot be cast to InternalRow") { + withTempView("v1") { + val sqlText = + """ + |CREATE OR REPLACE TEMP VIEW v1 AS + |SELECT * FROM VALUES + |(array( + | named_struct('s', 'string1', 'b', array(named_struct('e', 'string2'))), + | named_struct('s', 'string4', 'b', array(named_struct('e', 'string5'))) + | ) + |) + |v1(o); + |""".stripMargin + sql(sqlText) + + val df = sql("SELECT eo.b.e FROM (SELECT explode(o) AS eo FROM v1)") + checkAnswer(df, Row(Seq("string2")) :: Row(Seq("string5")) :: Nil) + } + } } case class GroupByKey(a: Int, b: Int) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org