Repository: spark Updated Branches: refs/heads/branch-2.0 bcf0c51b6 -> 4018c4800
[SPARK-15742][SQL] Reduce temp collections allocations in TreeNode transform methods In Catalyst's TreeNode transform methods we end up calling `productIterator.map(...).toArray` in a number of places, which is slightly inefficient because it needs to allocate an `ArrayBuilder` and grow a temporary array. Since we already know the size of the final output (`productArity`), we can simply allocate an array up-front and use a while loop to consume the iterator and populate the array. For most workloads, this performance difference is negligible but it does make a measurable difference in optimizer performance for queries that operate over very wide schemas (such as the benchmark queries in #13456). ### Perf results (from #13456 benchmarks) **Before** ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_66-b17 on Mac OS X 10.10.5 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz parsing large select: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ 1 select expressions 19 / 22 0.0 19119858.0 1.0X 10 select expressions 23 / 25 0.0 23208774.0 0.8X 100 select expressions 55 / 73 0.0 54768402.0 0.3X 1000 select expressions 229 / 259 0.0 228606373.0 0.1X 2500 select expressions 530 / 554 0.0 529938178.0 0.0X ``` **After** ``` parsing large select: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ 1 select expressions 15 / 21 0.0 14978203.0 1.0X 10 select expressions 22 / 27 0.0 22492262.0 0.7X 100 select expressions 48 / 64 0.0 48449834.0 0.3X 1000 select expressions 189 / 208 0.0 189346428.0 0.1X 2500 select expressions 429 / 449 0.0 428943897.0 0.0X ``` ### Author: Josh Rosen <joshro...@databricks.com> Closes #13484 from JoshRosen/treenode-productiterator-map. (cherry picked from commit e526913989d6099064886ea3ed3f6a2a0376a4f8) Signed-off-by: Josh Rosen <joshro...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4018c480 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4018c480 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4018c480 Branch: refs/heads/branch-2.0 Commit: 4018c48009ab479b996ba0ec0a3d0a47c97388ed Parents: bcf0c51 Author: Josh Rosen <joshro...@databricks.com> Authored: Fri Jun 3 13:53:02 2016 -0700 Committer: Josh Rosen <joshro...@databricks.com> Committed: Fri Jun 3 13:53:49 2016 -0700 ---------------------------------------------------------------------- .../spark/sql/catalyst/plans/QueryPlan.scala | 6 ++--- .../spark/sql/catalyst/trees/TreeNode.scala | 26 +++++++++++++++----- 2 files changed, 23 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/4018c480/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 6784c3a..3de15a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -172,7 +172,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT case null => null } - val newArgs = productIterator.map(recursiveTransform).toArray + val newArgs = mapProductIterator(recursiveTransform) if (changed) makeCopy(newArgs).asInstanceOf[this.type] else this } @@ -206,7 +206,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT case null => null } - val newArgs = productIterator.map(recursiveTransform).toArray + val newArgs = mapProductIterator(recursiveTransform) if (changed) makeCopy(newArgs).asInstanceOf[this.type] else this } @@ -318,7 +318,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT case other => other } - productIterator.map { + mapProductIterator { case s: Option[_] => s.map(cleanArg) case s: Seq[_] => s.map(cleanArg) case m: Map[_, _] => m.mapValues(cleanArg) http://git-wip-us.apache.org/repos/asf/spark/blob/4018c480/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 50481cd..22d82c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -21,6 +21,7 @@ import java.util.UUID import scala.collection.Map import scala.collection.mutable.Stack +import scala.reflect.ClassTag import org.apache.commons.lang.ClassUtils import org.json4s.JsonAST._ @@ -169,11 +170,24 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } /** + * Efficient alternative to `productIterator.map(f).toArray`. + */ + protected def mapProductIterator[B: ClassTag](f: Any => B): Array[B] = { + val arr = Array.ofDim[B](productArity) + var i = 0 + while (i < arr.length) { + arr(i) = f(productElement(i)) + i += 1 + } + arr + } + + /** * Returns a copy of this node where `f` has been applied to all the nodes children. */ def mapChildren(f: BaseType => BaseType): BaseType = { var changed = false - val newArgs = productIterator.map { + val newArgs = mapProductIterator { case arg: TreeNode[_] if containsChild(arg) => val newChild = f(arg.asInstanceOf[BaseType]) if (newChild fastEquals arg) { @@ -184,7 +198,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } case nonChild: AnyRef => nonChild case null => null - }.toArray + } if (changed) makeCopy(newArgs) else this } @@ -197,7 +211,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { var changed = false val remainingNewChildren = newChildren.toBuffer val remainingOldChildren = children.toBuffer - val newArgs = productIterator.map { + val newArgs = mapProductIterator { case s: StructType => s // Don't convert struct types to some other type of Seq[StructField] // Handle Seq[TreeNode] in TreeNode parameters. case s: Seq[_] => s.map { @@ -237,7 +251,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } case nonChild: AnyRef => nonChild case null => null - }.toArray + } if (changed) makeCopy(newArgs) else this } @@ -302,7 +316,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { rule: PartialFunction[BaseType, BaseType], nextOperation: (BaseType, PartialFunction[BaseType, BaseType]) => BaseType): BaseType = { var changed = false - val newArgs = productIterator.map { + val newArgs = mapProductIterator { case arg: TreeNode[_] if containsChild(arg) => val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { @@ -353,7 +367,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } case nonChild: AnyRef => nonChild case null => null - }.toArray + } if (changed) makeCopy(newArgs) else this } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org