Repository: spark Updated Branches: refs/heads/master e75e340a4 -> ab4a6bfd1
[SPARK-12898] Consider having dummyCallSite for HiveTableScan Currently, HiveTableScan runs with getCallSite, which is really expensive and shows up when scanning through large tables with partitions (e.g. TPC-DS), slowing down the overall runtime of the job. It would be good to consider having dummyCallSite in HiveTableScan. Author: Rajesh Balamohan <rbalamo...@apache.org> Closes #10825 from rajeshbalamohan/SPARK-12898. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab4a6bfd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab4a6bfd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab4a6bfd Branch: refs/heads/master Commit: ab4a6bfd11b870428eb2a96aa213f7d34c0aa622 Parents: e75e340 Author: Rajesh Balamohan <rbalamo...@apache.org> Authored: Wed Jan 20 11:30:03 2016 -0800 Committer: Reynold Xin <r...@databricks.com> Committed: Wed Jan 20 11:30:03 2016 -0800 ---------------------------------------------------------------------- .../spark/sql/hive/execution/HiveTableScan.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/ab4a6bfd/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala index 1588728..eff8833 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive._ import org.apache.spark.sql.types.{BooleanType, DataType} 
+import org.apache.spark.util.Utils /** * The Hive table scan operator. Column and partition pruning are both handled. @@ -133,11 +134,17 @@ case class HiveTableScan( } protected override def doExecute(): RDD[InternalRow] = { + // Using dummyCallSite, as getCallSite can turn out to be expensive with + // with multiple partitions. val rdd = if (!relation.hiveQlTable.isPartitioned) { - hadoopReader.makeRDDForTable(relation.hiveQlTable) + Utils.withDummyCallSite(sqlContext.sparkContext) { + hadoopReader.makeRDDForTable(relation.hiveQlTable) + } } else { - hadoopReader.makeRDDForPartitionedTable( - prunePartitions(relation.getHiveQlPartitions(partitionPruningPred))) + Utils.withDummyCallSite(sqlContext.sparkContext) { + hadoopReader.makeRDDForPartitionedTable( + prunePartitions(relation.getHiveQlPartitions(partitionPruningPred))) + } } rdd.mapPartitionsInternal { iter => val proj = UnsafeProjection.create(schema) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org