Repository: spark
Updated Branches:
  refs/heads/master e11d90bf8 -> 16186cdcb
[SPARK-20955][CORE] Intern "executorId" to reduce the memory usage

## What changes were proposed in this pull request?

At [this line](https://github.com/apache/spark/blob/f7cf2096fdecb8edab61c8973c07c6fc877ee32d/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala#L128), Spark takes the `executorId` string received from executors, and that string eventually ends up in `TaskUIData`. Because deserializing the `executorId` string always creates a new instance, we accumulate many duplicated string instances. This PR interns the strings in `TaskUIData` to reduce memory usage.

## How was this patch tested?

Tested manually using `bin/spark-shell --master local-cluster[6,1,1024]`. Test code:

```
for (_ <- 1 to 10) {
  sc.makeRDD(1 to 1000, 1000).count()
}
Thread.sleep(2000)
val l = sc.getClass.getMethod("jobProgressListener").invoke(sc).asInstanceOf[org.apache.spark.ui.jobs.JobProgressListener]
org.apache.spark.util.SizeEstimator.estimate(l.stageIdToData)
```

In the above case, this PR reduces the estimated size of `stageIdToData` from 3,487,280 to 3,009,744 bytes (86.3% of the original).

Author: Shixiong Zhu <shixi...@databricks.com>

Closes #18177 from zsxwing/SPARK-20955.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16186cdc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16186cdc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16186cdc

Branch: refs/heads/master
Commit: 16186cdcbce1a2ec8f839c550e6b571bf5dc2692
Parents: e11d90b
Author: Shixiong Zhu <shixi...@databricks.com>
Authored: Fri Jun 2 10:33:21 2017 -0700
Committer: Shixiong Zhu <shixi...@databricks.com>
Committed: Fri Jun 2 10:33:21 2017 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/ui/jobs/UIData.scala | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/16186cdc/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
index 048c4ad..6764daa 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala
@@ -20,6 +20,8 @@ package org.apache.spark.ui.jobs
 import scala.collection.mutable
 import scala.collection.mutable.{HashMap, LinkedHashMap}
 
+import com.google.common.collect.Interners
+
 import org.apache.spark.JobExecutionStatus
 import org.apache.spark.executor._
 import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}
@@ -141,6 +143,14 @@ private[spark] object UIData {
   }
 
   object TaskUIData {
+
+    private val stringInterner = Interners.newWeakInterner[String]()
+
+    /** String interning to reduce the memory usage. */
+    private def weakIntern(s: String): String = {
+      stringInterner.intern(s)
+    }
+
     def apply(taskInfo: TaskInfo): TaskUIData = {
       new TaskUIData(dropInternalAndSQLAccumulables(taskInfo))
     }
@@ -155,8 +165,8 @@ private[spark] object UIData {
         index = taskInfo.index,
         attemptNumber = taskInfo.attemptNumber,
         launchTime = taskInfo.launchTime,
-        executorId = taskInfo.executorId,
-        host = taskInfo.host,
+        executorId = weakIntern(taskInfo.executorId),
+        host = weakIntern(taskInfo.host),
         taskLocality = taskInfo.taskLocality,
         speculative = taskInfo.speculative
       )
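
For context, here is a minimal standalone sketch (not part of the patch) of the weak-interning pattern the diff applies. The `stringInterner`/`weakIntern` names mirror the diff; everything else (`WeakInternDemo`, `main`, the `"executor-42"` id) is illustrative, and the sketch assumes Guava is on the classpath:

```
import com.google.common.collect.Interners

object WeakInternDemo {
  // Same pattern as the patch: the interner keeps one canonical copy of each
  // distinct string, and holds it only weakly, so the copy can be
  // garbage-collected once nothing else references it.
  private val stringInterner = Interners.newWeakInterner[String]()

  private def weakIntern(s: String): String = stringInterner.intern(s)

  def main(args: Array[String]): Unit = {
    // Simulate deserialization: each new String(...) is a distinct object,
    // even though all of them are equal.
    val ids = (1 to 1000).map(_ => new String("executor-42"))
    println(ids.map(System.identityHashCode).distinct.size) // ~1000 distinct objects

    // After interning, every copy collapses onto one canonical instance.
    val interned = ids.map(weakIntern)
    println(interned.map(System.identityHashCode).distinct.size) // 1
  }
}
```

One plausible reason for using Guava's weak interner rather than `String.intern()` here is that the canonical copies live in an ordinary heap structure under the application's control instead of the JVM-wide string table, and they do not get pinned: once the UI data referencing an executor id is dropped, the interned copy can be reclaimed by the GC.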