[GitHub] [spark] maropu commented on a change in pull request #31966: [SPARK-34638][SQL] Single field nested column prune on generator output

GitBox Sun, 28 Mar 2021 18:49:55 -0700


maropu commented on a change in pull request #31966:
URL: https://github.com/apache/spark/pull/31966#discussion_r602970835




##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing {
       // On top on `Generate`, a `Project` that might have nested column 
accessors.
       // We try to get alias maps for both project list and generator's 
children expressions.
       val exprsToPrune = projectList ++ g.generator.children
-      NestedColumnAliasing.getAliasSubMap(exprsToPrune, 
g.qualifiedGeneratorOutput).map {
+      NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
         case (nestedFieldToAlias, attrToAliases) =>
           // Defer updating `Generate.unrequiredChildIndex` to next round of 
`ColumnPruning`.

Review comment:
       Move this comment into L275-276?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -231,6 +231,27 @@ object NestedColumnAliasing {
  * of it.
  */
 object GeneratorNestedColumnAliasing {
+  // Partitions `attrToAliases` based on whether the attribute is in 
Generator's output.
+  private def aliasesOnGeneratorOutput(
+      attrToAliases: Map[ExprId, Seq[Alias]],
+      generatorOutput: Seq[Attribute]) = {
+    val generatorOutputExprId = generatorOutput.map(_.exprId)
+    attrToAliases.partition { k =>
+      generatorOutputExprId.contains(k._1)
+    }
+  }
+
+  // Partitions `nestedFieldToAlias` based on whether the attribute of nested 
field extractor
+  // is in Generator's output.
+  private def nestedFieldOnGeneratorOutput(
+      nestedFieldToAlias: Map[ExtractValue, Alias],
+      generatorOutput: Seq[Attribute]) = {
+    val generatorOutputSet = AttributeSet(generatorOutput)
+    nestedFieldToAlias.partition { pair =>
+      pair._1.references.subsetOf(generatorOutputSet)
+    }
+  }

Review comment:
       Since these functions are used only once, how about inlining them?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing {
       // On top on `Generate`, a `Project` that might have nested column 
accessors.
       // We try to get alias maps for both project list and generator's 
children expressions.
       val exprsToPrune = projectList ++ g.generator.children
-      NestedColumnAliasing.getAliasSubMap(exprsToPrune, 
g.qualifiedGeneratorOutput).map {
+      NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
         case (nestedFieldToAlias, attrToAliases) =>
           // Defer updating `Generate.unrequiredChildIndex` to next round of 
`ColumnPruning`.
-          val newChild =
-            NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, 
attrToAliases)
-          Project(NestedColumnAliasing.getNewProjectList(projectList, 
nestedFieldToAlias), newChild)
+
+          val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+            nestedFieldOnGeneratorOutput(nestedFieldToAlias, 
g.qualifiedGeneratorOutput)
+          val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+            aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+          // Push nested column accessors through `Generator`. We cannot prune 
on `Generator`'s
+          // output.
+          val newChild = NestedColumnAliasing.replaceWithAliases(g,
+            nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+          val pushedThrough = Project(NestedColumnAliasing
+            .getNewProjectList(projectList, nestedFieldsNotOnGenerator), 
newChild)
+
+          // Pruning on `Generator`'s output. We only process single field 
case.
+          // For multiple field case, we cannot directly move field extractor 
into
+          // the generator expression. A workaround is to re-construct array 
of struct
+          // from multiple fields. But it will be more complicated and may not 
worth.
+          if (nestedFieldsOnGenerator.size == 1) {
+            // Only one nested column accessor.
+            // E.g., df.select(explode($"items").as("item")).select($"item.a")
+            pushedThrough match {
+              case p @ Project(_, newG: Generate) =>
+                // Replace the child expression of `ExplodeBase` generator with
+                // nested column accessor.
+                // E.g., df.select(explode($"items").as("item")) =>
+                //       df.select(explode($"items.a").as("item"))
+                val rewrittenG = newG.transformExpressions {
+                  case e: ExplodeBase =>
+                    val extractor = 
nestedFieldsOnGenerator.head._1.transformUp {
+                      case _: Attribute =>
+                        e.child
+                      case g: GetStructField =>
+                        ExtractValue(g.child, Literal(g.extractFieldName), 
SQLConf.get.resolver)

Review comment:
       We need tests for case-sensitivity?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,69 @@ object GeneratorNestedColumnAliasing {
       // On top on `Generate`, a `Project` that might have nested column 
accessors.
       // We try to get alias maps for both project list and generator's 
children expressions.
       val exprsToPrune = projectList ++ g.generator.children
-      NestedColumnAliasing.getAliasSubMap(exprsToPrune, 
g.qualifiedGeneratorOutput).map {
+      NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
         case (nestedFieldToAlias, attrToAliases) =>
           // Defer updating `Generate.unrequiredChildIndex` to next round of 
`ColumnPruning`.
-          val newChild =
-            NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, 
attrToAliases)
-          Project(NestedColumnAliasing.getNewProjectList(projectList, 
nestedFieldToAlias), newChild)
+
+          val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+            nestedFieldOnGeneratorOutput(nestedFieldToAlias, 
g.qualifiedGeneratorOutput)
+          val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+            aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+          // Push nested column accessors through `Generator`. We cannot prune 
on `Generator`'s
+          // output.
+          val newChild = NestedColumnAliasing.replaceWithAliases(g,
+            nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+          val pushedThrough = Project(NestedColumnAliasing
+            .getNewProjectList(projectList, nestedFieldsNotOnGenerator), 
newChild)
+
+          // Pruning on `Generator`'s output. We only process single field 
case.
+          // For multiple field case, we cannot directly move field extractor 
into
+          // the generator expression. A workaround is to re-construct array 
of struct
+          // from multiple fields. But it will be more complicated and may not 
worth.
+          if (nestedFieldsOnGenerator.size == 1) {

Review comment:
       Could we file jira for this TODO task and leave the jira ID here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] maropu commented on a change in pull request #31966: [SPARK-34638][SQL] Single field nested column prune on generator output

Reply via email to