maropu commented on a change in pull request #31966: URL: https://github.com/apache/spark/pull/31966#discussion_r602970835
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala ########## @@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing { // On top on `Generate`, a `Project` that might have nested column accessors. // We try to get alias maps for both project list and generator's children expressions. val exprsToPrune = projectList ++ g.generator.children - NestedColumnAliasing.getAliasSubMap(exprsToPrune, g.qualifiedGeneratorOutput).map { + NestedColumnAliasing.getAliasSubMap(exprsToPrune).map { case (nestedFieldToAlias, attrToAliases) => // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. Review comment: Move this comment into L275-276? ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala ########## @@ -231,6 +231,27 @@ object NestedColumnAliasing { * of it. */ object GeneratorNestedColumnAliasing { + // Partitions `attrToAliases` based on whether the attribute is in Generator's output. + private def aliasesOnGeneratorOutput( + attrToAliases: Map[ExprId, Seq[Alias]], + generatorOutput: Seq[Attribute]) = { + val generatorOutputExprId = generatorOutput.map(_.exprId) + attrToAliases.partition { k => + generatorOutputExprId.contains(k._1) + } + } + + // Partitions `nestedFieldToAlias` based on whether the attribute of nested field extractor + // is in Generator's output. + private def nestedFieldOnGeneratorOutput( + nestedFieldToAlias: Map[ExtractValue, Alias], + generatorOutput: Seq[Attribute]) = { + val generatorOutputSet = AttributeSet(generatorOutput) + nestedFieldToAlias.partition { pair => + pair._1.references.subsetOf(generatorOutputSet) + } + } Review comment: Since these functions are used only once, how about inlining them? 
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala ########## @@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing { // On top on `Generate`, a `Project` that might have nested column accessors. // We try to get alias maps for both project list and generator's children expressions. val exprsToPrune = projectList ++ g.generator.children - NestedColumnAliasing.getAliasSubMap(exprsToPrune, g.qualifiedGeneratorOutput).map { + NestedColumnAliasing.getAliasSubMap(exprsToPrune).map { case (nestedFieldToAlias, attrToAliases) => // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. - val newChild = - NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, attrToAliases) - Project(NestedColumnAliasing.getNewProjectList(projectList, nestedFieldToAlias), newChild) + + val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) = + nestedFieldOnGeneratorOutput(nestedFieldToAlias, g.qualifiedGeneratorOutput) + val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) = + aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput) + + // Push nested column accessors through `Generator`. We cannot prune on `Generator`'s + // output. + val newChild = NestedColumnAliasing.replaceWithAliases(g, + nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator) + val pushedThrough = Project(NestedColumnAliasing + .getNewProjectList(projectList, nestedFieldsNotOnGenerator), newChild) + + // Pruning on `Generator`'s output. We only process single field case. + // For multiple field case, we cannot directly move field extractor into + // the generator expression. A workaround is to re-construct array of struct + // from multiple fields. But it will be more complicated and may not worth. + if (nestedFieldsOnGenerator.size == 1) { + // Only one nested column accessor. 
+ // E.g., df.select(explode($"items").as("item")).select($"item.a") + pushedThrough match { + case p @ Project(_, newG: Generate) => + // Replace the child expression of `ExplodeBase` generator with + // nested column accessor. + // E.g., df.select(explode($"items").as("item")) => + // df.select(explode($"items.a").as("item")) + val rewrittenG = newG.transformExpressions { + case e: ExplodeBase => + val extractor = nestedFieldsOnGenerator.head._1.transformUp { + case _: Attribute => + e.child + case g: GetStructField => + ExtractValue(g.child, Literal(g.extractFieldName), SQLConf.get.resolver) Review comment: Do we need tests for case-sensitivity? ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala ########## @@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing { // On top on `Generate`, a `Project` that might have nested column accessors. // We try to get alias maps for both project list and generator's children expressions. val exprsToPrune = projectList ++ g.generator.children - NestedColumnAliasing.getAliasSubMap(exprsToPrune, g.qualifiedGeneratorOutput).map { + NestedColumnAliasing.getAliasSubMap(exprsToPrune).map { case (nestedFieldToAlias, attrToAliases) => // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. - val newChild = - NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, attrToAliases) - Project(NestedColumnAliasing.getNewProjectList(projectList, nestedFieldToAlias), newChild) + + val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) = + nestedFieldOnGeneratorOutput(nestedFieldToAlias, g.qualifiedGeneratorOutput) + val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) = + aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput) + + // Push nested column accessors through `Generator`. We cannot prune on `Generator`'s + // output. 
+ val newChild = NestedColumnAliasing.replaceWithAliases(g, + nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator) + val pushedThrough = Project(NestedColumnAliasing + .getNewProjectList(projectList, nestedFieldsNotOnGenerator), newChild) + + // Pruning on `Generator`'s output. We only process single field case. + // For multiple field case, we cannot directly move field extractor into + // the generator expression. A workaround is to re-construct array of struct + // from multiple fields. But it will be more complicated and may not worth. + if (nestedFieldsOnGenerator.size == 1) { Review comment: Could we file a JIRA ticket for this TODO task and leave the JIRA ID here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org