[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/120#discussion_r10553691

--- Diff: yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala ---
@@ -133,11 +148,11 @@ class ClientArguments(val args: Array[String], val sparkConf: SparkConf) {
       "  --class CLASS_NAME         Name of your application's main class (required)\n" +
       "  --args ARGS                Arguments to be passed to your application's main class.\n" +
       "                             Multiple invocations are possible, each will be passed in order.\n" +
-      "  --num-workers NUM          Number of workers to start (Default: 2)\n" +
-      "  --worker-cores NUM         Number of cores for the workers (Default: 1).\n" +
-      "  --master-class CLASS_NAME  Class Name for Master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
-      "  --master-memory MEM        Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)\n" +
-      "  --worker-memory MEM        Memory per Worker (e.g. 1000M, 2G) (Default: 1G)\n" +
+      "  --num-executors NUM        Number of executors to start (Default: 2)\n" +
+      "  --executor-cores NUM       Number of cores for the executors (Default: 1).\n" +
+      "  --am-class CLASS_NAME      Class Name for application master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
--- End diff --

On this one I'm still a bit confused. @tgravescs what was the reason for making this configurable? Are there other ApplicationMaster implementations that could be used here?
[GitHub] spark pull request: SPARK-1126. spark-app preliminary
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/86#discussion_r10553709

--- Diff: core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala ---
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * Parses and encapsulates arguments from the spark-submit script.
+ */
+private[spark] class SparkSubmitArguments(args: Array[String]) {
+  var master: String = null
+  var deployMode: String = null
+  var executorMemory: String = null
+  var executorCores: String = null
+  var totalExecutorCores: String = null
+  var driverMemory: String = null
+  var driverCores: String = null
+  var supervise: Boolean = false
+  var queue: String = null
+  var numExecutors: String = null
+  var files: String = null
+  var archives: String = null
+  var mainClass: String = null
+  var primaryResource: String = null
+  var name: String = null
+  var childArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
+  var moreJars: String = null
+
+  loadEnvVars()
+  parseArgs(args.toList)
+
+  def loadEnvVars() {
+    master = System.getenv("MASTER")
+    deployMode = System.getenv("DEPLOY_MODE")
+  }
+
+  // TODO: some deploy mode and master need to be specified by default. local seems like a good choice. or we should just error?
+  // TODO: throw an exception instead of exiting to make things easier on tests?
+
+  def parseArgs(args: List[String]) {
+    if (args.size == 0) {
+      printUsageAndExit(1)
+      System.exit(1)
+    }
+    primaryResource = args(0)
+    parseOpts(args.tail)
+  }
+
+  def parseOpts(opts: List[String]): Unit = opts match {
+    case ("--name") :: value :: tail =>
+      name = value
+      parseOpts(tail)
+
+    case ("--master") :: value :: tail =>
+      master = value
+      parseOpts(tail)
+
+    case ("--class") :: value :: tail =>
+      mainClass = value
+      parseOpts(tail)
+
+    case ("--deploy-mode") :: value :: tail =>
+      if (value != "client" && value != "cluster") {
+        System.err.println("--deploy-mode must be either \"client\" or \"cluster\"")
+        System.exit(1)
+      }
+      deployMode = value
+      parseOpts(tail)
+
+    case ("--num-executors") :: value :: tail =>
+      numExecutors = value
+      parseOpts(tail)
+
+    case ("--total-executor-cores") :: value :: tail =>
+      totalExecutorCores = value
+      parseOpts(tail)
+
+    case ("--executor-cores") :: value :: tail =>
+      executorCores = value
+      parseOpts(tail)
+
+    case ("--executor-memory") :: value :: tail =>
+      executorMemory = value
+      parseOpts(tail)
+
+    case ("--driver-memory") :: value :: tail =>
+      driverMemory = value
+      parseOpts(tail)
+
+    case ("--driver-cores") :: value :: tail =>
+      driverCores = value
+      parseOpts(tail)
+
+    case ("--supervise") :: tail =>
+      supervise = true
+      parseOpts(tail)
+
+    case ("--queue") :: value :: tail =>
+      queue = value
+      parseOpts(tail)
+
+    case ("--files") :: value :: tail =>
+      files = value
+      parseOpts(tail)
+
+    case ("--archives") :: value :: tail =>
+      archives = value
+      parseOpts(tail)
+
+    case ("--arg") :: value :: tail =>
+      childArgs += value
+      parseOpts(tail)
+
+    case ("--more-jars") :: value :: tail =>
+      moreJars = value
+      parseOpts(tail)
+
+    case ("--help" | "-h") :: tail =>
+      printUsageAndExit(0)
+
+    case Nil =>
+
+    case _ =>
+      printUsageAndExit(1, opts)
+  }
+
+  def printUsageAndExit(exitCode: Int, unknownParam: Any = null) {
+    if (unknownParam != null) {
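For illustration, here is a hypothetical invocation of the parser quoted above. The jar name, flag values, and class name are made-up examples, not from the PR:

```scala
// Hypothetical usage of SparkSubmitArguments; all values are illustrative.
val parsed = new SparkSubmitArguments(Array(
  "my-app.jar",                      // first positional argument becomes primaryResource
  "--master", "spark://host:7077",
  "--deploy-mode", "cluster",
  "--class", "com.example.Main",
  "--arg", "input.txt"))

assert(parsed.primaryResource == "my-app.jar")
assert(parsed.deployMode == "cluster")
```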
[GitHub] spark pull request: SPARK-1126. spark-app preliminary
Github user pwendell commented on the pull request: https://github.com/apache/spark/pull/86#issuecomment-37502813 @mateiz maybe you could take a pass on this?
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/120#discussion_r10553724

--- Diff: yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala ---
@@ -133,11 +148,11 @@ class ClientArguments(val args: Array[String], val sparkConf: SparkConf) {
       "  --class CLASS_NAME         Name of your application's main class (required)\n" +
       "  --args ARGS                Arguments to be passed to your application's main class.\n" +
       "                             Multiple invocations are possible, each will be passed in order.\n" +
-      "  --num-workers NUM          Number of workers to start (Default: 2)\n" +
-      "  --worker-cores NUM         Number of cores for the workers (Default: 1).\n" +
-      "  --master-class CLASS_NAME  Class Name for Master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
-      "  --master-memory MEM        Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)\n" +
-      "  --worker-memory MEM        Memory per Worker (e.g. 1000M, 2G) (Default: 1G)\n" +
+      "  --num-executors NUM        Number of executors to start (Default: 2)\n" +
+      "  --executor-cores NUM       Number of cores for the executors (Default: 1).\n" +
+      "  --am-class CLASS_NAME      Class Name for application master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
--- End diff --

Ah, I guess it's totally orthogonal to this PR - but nonetheless curious.
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/120#discussion_r10553741

--- Diff: yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala ---
@@ -67,24 +67,39 @@ class ClientArguments(val args: Array[String], val sparkConf: SparkConf) {
          userArgsBuffer += value
          args = tail

-        case ("--master-class") :: value :: tail =>
+        case ("--master-class" | "--am-class") :: value :: tail =>
--- End diff --

IMO it's fine to ignore this. I'd consider it the same as someone giving the same flag twice, in which case it will overwrite one value with another.
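For what it's worth, the overwrite behavior pwendell describes falls out of this style of recursive parser; a toy sketch (illustrative only, not the PR's code):

```scala
// Toy sketch: recursive option parsing where a repeated or aliased flag
// simply overwrites the previously parsed value, so the last occurrence wins.
def parseAmClass(opts: List[String],
    amClass: String = "spark.deploy.yarn.ApplicationMaster"): String = opts match {
  case ("--master-class" | "--am-class") :: value :: tail => parseAmClass(tail, value)
  case _ :: tail => parseAmClass(tail, amClass)
  case Nil => amClass
}

// parseAmClass(List("--master-class", "a.B", "--am-class", "c.D")) == "c.D"
```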
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user pwendell commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37503046

Sandy - looks good to me. Are you still changing things? I noticed there are a few comments that maybe should be updated:

```
alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala: // Used to generate a unique id per worker
stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala: // doAs in order for the credentials to be passed on to the worker containers.
```

@tgravescs any other feedback?
[GitHub] spark pull request: SPARK-1236 - Upgrade Jetty to 9.1.3.v20140225.
Github user pwendell commented on the pull request: https://github.com/apache/spark/pull/113#issuecomment-37503197 @andrewor14 any comments or reservations on this one?
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user sryza commented on a diff in the pull request: https://github.com/apache/spark/pull/120#discussion_r10554025

--- Diff: yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala ---
@@ -133,11 +148,11 @@ class ClientArguments(val args: Array[String], val sparkConf: SparkConf) {
       "  --class CLASS_NAME         Name of your application's main class (required)\n" +
       "  --args ARGS                Arguments to be passed to your application's main class.\n" +
       "                             Multiple invocations are possible, each will be passed in order.\n" +
-      "  --num-workers NUM          Number of workers to start (Default: 2)\n" +
-      "  --worker-cores NUM         Number of cores for the workers (Default: 1).\n" +
-      "  --master-class CLASS_NAME  Class Name for Master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
-      "  --master-memory MEM        Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)\n" +
-      "  --worker-memory MEM        Memory per Worker (e.g. 1000M, 2G) (Default: 1G)\n" +
+      "  --num-executors NUM        Number of executors to start (Default: 2)\n" +
+      "  --executor-cores NUM       Number of cores for the executors (Default: 1).\n" +
+      "  --am-class CLASS_NAME      Class Name for application master (Default: spark.deploy.yarn.ApplicationMaster)\n" +
--- End diff --

The application master used for yarn-client mode is different from the application master used for standalone mode (the latter runs the driver, while the former is only used to request resources). They both use org.apache.spark.deploy.yarn.Client to submit the app to YARN, but each uses a different AM class.
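In code terms, the distinction sryza describes might look roughly like the sketch below. This is hypothetical selection logic, not the PR's code, and since SPARK-1183 itself renames worker-related names, treat the client-mode launcher class name as an assumption:

```scala
// Hypothetical sketch of choosing an AM class per deploy mode. In
// yarn-standalone (cluster) mode the AM runs the user's driver; in
// yarn-client mode the AM only negotiates containers with YARN while
// the driver stays on the client machine.
def amClassFor(deployMode: String): String = deployMode match {
  case "yarn-standalone" => "org.apache.spark.deploy.yarn.ApplicationMaster" // runs the driver
  case "yarn-client"     => "org.apache.spark.deploy.yarn.WorkerLauncher"    // resource requests only
  case other             => sys.error("Unsupported deploy mode: " + other)
}
```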
[GitHub] spark pull request: SPARK-1019: pyspark RDD take() throws an NPE
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/112
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37504134 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13150/
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37504125 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13152/
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37504124 Merged build finished.
[GitHub] spark pull request: MLI-1 Decision Trees
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/79#issuecomment-37504122 Merged build finished.
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37504133 Merged build finished.
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37504132 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13151/
[GitHub] spark pull request: Spark 1162 Implemented takeOrdered in pyspark.
Github user mateiz commented on the pull request: https://github.com/apache/spark/pull/97#issuecomment-37505082 BTW as mentioned above, please use PriorityQueue here instead of copying their heap. It's just a lot of work to copy the heap; we can take the performance hit instead.
[GitHub] spark pull request: [SPARK-1198] Allow pipes tasks to run in diffe...
Github user mateiz commented on a diff in the pull request: https://github.com/apache/spark/pull/128#discussion_r10554415

--- Diff: core/pom.xml ---
@@ -184,13 +184,12 @@
       <artifactId>metrics-graphite</artifactId>
     </dependency>
     <dependency>
-      <groupId>org.apache.derby</groupId>
-      <artifactId>derby</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.derby</groupId>
+      <artifactId>derby</artifactId>
--- End diff --

Is this change related to the PR?
[GitHub] spark pull request: [Spark-1234] clean up text in running-on-yarn....
Github user mateiz commented on the pull request: https://github.com/apache/spark/pull/130#issuecomment-37505269 Jenkins, add to whitelist and test this please
[GitHub] spark pull request: Spark 1162 Implemented takeOrdered in pyspark.
Github user ScrapCodes commented on the pull request: https://github.com/apache/spark/pull/97#issuecomment-37505304 PriorityQueue in a way is just a wrapper over heapq that adds blocking put and get (AFAIU). We would need a max-heap to retain the top N smallest elements. One other thing we could do instead of copying heapq is write one of our own in a nicely extensible way that allows plugging in a comparator.
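For reference, the bounded max-heap idea under discussion can be sketched in a few lines. This is a minimal Scala illustration of the technique (not pyspark's code, and not necessarily how Spark implements takeOrdered): keep at most N elements and evict the current maximum whenever something smaller arrives.

```scala
import scala.collection.mutable

// Minimal sketch: retain the N smallest elements with a bounded max-heap.
// scala.collection.mutable.PriorityQueue dequeues the largest element first,
// so heap.head is the biggest of the N elements kept so far.
def takeOrderedSketch[T](items: Iterator[T], n: Int)(implicit ord: Ordering[T]): Seq[T] = {
  val heap = mutable.PriorityQueue.empty[T](ord)
  items.foreach { x =>
    if (heap.size < n) {
      heap.enqueue(x)
    } else if (ord.lt(x, heap.head)) {
      heap.dequeue() // evict the current maximum
      heap.enqueue(x)
    }
  }
  heap.dequeueAll.reverse // largest-first dequeue order, reversed to ascending
}

// takeOrderedSketch(Iterator(5, 1, 4, 2, 3), 3) == Seq(1, 2, 3)
```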
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user MLnick commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37506200 @mengxr nice work! Thanks for this.
[GitHub] spark pull request: Spark 1162 Implemented takeOrdered in pyspark.
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/97#issuecomment-37506805 Merged build started.
[GitHub] spark pull request: [Spark-1234] clean up text in running-on-yarn....
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/130#issuecomment-37506800 Build triggered.
[GitHub] spark pull request: SPARK-1126. spark-app preliminary
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/86#issuecomment-37506962 Merged build triggered.
[GitHub] spark pull request: SPARK-1126. spark-app preliminary
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/86#issuecomment-37507039 One or more automated tests failed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13155/
[GitHub] spark pull request: SPARK-1126. spark-app preliminary
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/86#issuecomment-37507037 Merged build finished.
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/131#issuecomment-37507055 Thanks. I've merged this.
[GitHub] spark pull request: Spark 1162 Implemented takeOrdered in pyspark.
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/97#issuecomment-37509934 Merged build finished.
[GitHub] spark pull request: [Spark-1234] clean up text in running-on-yarn....
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/130#issuecomment-37509933 Build finished.
[GitHub] spark pull request: Spark 1162 Implemented takeOrdered in pyspark.
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/97#issuecomment-37509935 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13154/
[GitHub] spark pull request: SPARK-1096, a space after comment start style ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/124#issuecomment-37510206 Merged build started.
[GitHub] spark pull request: [SPARK-1237, 1238] Improve the computation of ...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/131
[GitHub] spark pull request: SPARK-1096, a space after comment start style ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/124#issuecomment-37514176 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13156/
[GitHub] spark pull request: SPARK-1096, a space after comment start style ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/124#issuecomment-37514175 Merged build finished.
[GitHub] spark pull request: SPARK-1096, a space after comment start style ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/124#issuecomment-37514378 Merged build triggered.
[GitHub] spark pull request: MetadataCleaner - fine control cleanup documen...
Github user mridulm commented on a diff in the pull request: https://github.com/apache/spark/pull/89#discussion_r10559136

--- Diff: docs/configuration.md ---
@@ -487,6 +477,88 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 </table>
+
+The following are the properties that can be used to schedule cleanup jobs at different levels.
+The metadata tuning parameters mentioned below should be set with a lot of consideration and only where required.
+Scheduling metadata cleaning in the middle of a job can result in a lot of unnecessary re-computations.
+
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+  <td>spark.cleaner.ttl</td>
+  <td>(infinite)</td>
+  <td>
+    Duration (seconds) of how long Spark will remember any metadata (stages generated, tasks generated, etc.).
+    Periodic cleanups will ensure that metadata older than this duration will be forgotten. This is
+    useful for running Spark for many hours / days (for example, running 24/7 in case of Spark Streaming
+    applications). Note that any RDD that persists in memory for more than this duration will be cleared as well.
+  </td>
+</tr>
+<tr>
+  <td>spark.cleaner.ttl.MAP_OUTPUT_TRACKER</td>
+  <td>spark.cleaner.ttl, with a min. value of 10 secs</td>
+  <td>
+    Cleans up the map containing the information of the mapper (the input block manager Id and the output result size) corresponding to a shuffle Id.
+  </td>
--- End diff --

You might want to add that this takes precedence over spark.cleaner.ttl.
[GitHub] spark pull request: MetadataCleaner - fine control cleanup documen...
Github user mridulm commented on a diff in the pull request: https://github.com/apache/spark/pull/89#discussion_r10559142

--- Diff: docs/configuration.md ---
@@ -487,6 +477,88 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 </table>
+
+The following are the properties that can be used to schedule cleanup jobs at different levels.
+The metadata tuning parameters mentioned below should be set with a lot of consideration and only where required.
+Scheduling metadata cleaning in the middle of a job can result in a lot of unnecessary re-computations.
+
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+  <td>spark.cleaner.ttl</td>
+  <td>(infinite)</td>
+  <td>
+    Duration (seconds) of how long Spark will remember any metadata (stages generated, tasks generated, etc.).
+    Periodic cleanups will ensure that metadata older than this duration will be forgotten. This is
+    useful for running Spark for many hours / days (for example, running 24/7 in case of Spark Streaming
+    applications). Note that any RDD that persists in memory for more than this duration will be cleared as well.
+  </td>
+</tr>
+<tr>
+  <td>spark.cleaner.ttl.MAP_OUTPUT_TRACKER</td>
+  <td>spark.cleaner.ttl, with a min. value of 10 secs</td>
+  <td>
+    Cleans up the map containing the information of the mapper (the input block manager Id and the output result size) corresponding to a shuffle Id.
+  </td>
--- End diff --

Same for the rest ...
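The precedence mridulm asks to document could be expressed as a lookup with a fallback. The helper below is a hedged sketch of the intent (the function name is assumed, and the actual MetadataCleaner lookup logic may differ):

```scala
import org.apache.spark.SparkConf

// Sketch of the intended precedence: a component-specific TTL such as
// spark.cleaner.ttl.MAP_OUTPUT_TRACKER overrides the global spark.cleaner.ttl
// when both are set; -1 stands in for "infinite" (cleaning disabled).
def cleanerTtl(conf: SparkConf, component: String): Int = {
  val globalTtl = conf.getInt("spark.cleaner.ttl", -1)
  conf.getInt("spark.cleaner.ttl." + component, globalTtl)
}
```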
[GitHub] spark pull request: SPARK-1096, a space after comment start style ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/124#issuecomment-37518903 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13157/
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37518963 Merged build started.
[GitHub] spark pull request: Add unit test to spark_ec2 script
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/134#issuecomment-37518956 Can one of the admins verify this patch?
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37518962 Merged build triggered.
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37523239 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13158/
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37527761 Merged build started.
[GitHub] spark pull request: Prevent ContextClassLoader of Actor from becom...
Github user ScrapCodes commented on the pull request: https://github.com/apache/spark/pull/15#issuecomment-37529958 Thanks for the fix. Just for the record, this happens only when MASTER=local or local[2]. Looks good. It might be good to add the above test case to ReplSuite, though.
[GitHub] spark pull request: Prevent ContextClassLoader of Actor from becom...
Github user ScrapCodes commented on the pull request: https://github.com/apache/spark/pull/15#issuecomment-37530227 Mind changing the PR title to add the JIRA ID?
[GitHub] spark pull request: Spark 615 map partitions with index callable f...
Github user ScrapCodes commented on the pull request: https://github.com/apache/spark/pull/16#issuecomment-37531010 It might be good to add this test to the Java 8 API suite? Not sure if it's 100% necessary, but there exists one for all other APIs (I hope!). Thoughts?
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37533206 Merged build triggered.
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37533178 Merged build finished.
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37540002 Merged build finished.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37540073 Merged build triggered.
[GitHub] spark pull request: SPARK-1144 Added license and RAT to check lice...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/125#issuecomment-37540004 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13160/
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user tgravescs commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37543156 Looks good to me. I made the small comment about perhaps leaving the --am-class out of the usage, but I'm ok either way.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37547666 Merged build finished.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37547667 All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13161/
[GitHub] spark pull request: MetadataCleaner - fine control cleanup documen...
Github user andrewor14 commented on the pull request: https://github.com/apache/spark/pull/89#issuecomment-37554932 @mridulm and @puravaggarwal Thanks for doing this. As a general direction, however, we are actually moving away from MetadataCleaner in the long run - PR#126 is the first step, which cleans up state explicitly once it is out of scope of the application. Periodic cleanup is really a somewhat hacky mechanism, because surprisingly often we end up removing what the application actually still needs, especially in streaming. Therefore, I am not sure we actually want to expose these fields to the user, since doing so encourages them to use a feature that is not technically correct and is intended to be deprecated in the future.
[GitHub] spark pull request: SPARK-1236 - Upgrade Jetty to 9.1.3.v20140225.
Github user andrewor14 commented on the pull request: https://github.com/apache/spark/pull/113#issuecomment-37555992 LGTM
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user aarondav commented on a diff in the pull request: https://github.com/apache/spark/pull/101#discussion_r10576154

--- Diff: core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala ---
@@ -222,4 +232,19 @@ private[spark] object HadoopRDD {
   def putCachedMetadata(key: String, value: Any) =
     SparkEnv.get.hadoopJobMetadata.put(key, value)
+
+  def addLocalConfiguration(jobTrackerId: String, jobId: Int, splitId: Int, attemptId: Int,
+      conf: JobConf) {
+    // generate job id
+    //val stageId = context.stageId
--- End diff --

here too
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10576181

--- Diff: core/src/main/scala/org/apache/spark/MapOutputTracker.scala ---
@@ -149,14 +151,9 @@ private[spark] class MapOutputTracker(conf: SparkConf) extends Logging {
     }
   }
--- End diff --

Not exactly related to your patch, but does the MOTMaster ever call `getServerStatus`? It seems unnecessary since it already has all the data. Now that we also have a MOTWorker, we should put this and other methods / fields in there (`askTracker`, `communicate`, `val fetching`, `getServerStatus` and `updateEpoch`). Right now it's a little confusing.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user CodingCat commented on a diff in the pull request: https://github.com/apache/spark/pull/101#discussion_r10576332

--- Diff: core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala ---
@@ -222,4 +232,19 @@ private[spark] object HadoopRDD {
   def putCachedMetadata(key: String, value: Any) =
     SparkEnv.get.hadoopJobMetadata.put(key, value)
+
+  def addLocalConfiguration(jobTrackerId: String, jobId: Int, splitId: Int, attemptId: Int,
+      conf: JobConf) {
+    // generate job id
+    //val stageId = context.stageId
+    val jobID = new JobID(jobTrackerId, jobId)
+    //val attemptId = (attemptId % Int.MaxValue).toInt
--- End diff --

Oh, sorry, sorry, I forgot to do that~
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37563131 Merged build triggered.
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37563870 Merged build triggered.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user aarondav commented on a diff in the pull request: https://github.com/apache/spark/pull/101#discussion_r10576136

--- Diff: core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala ---
@@ -222,4 +232,19 @@ private[spark] object HadoopRDD {
   def putCachedMetadata(key: String, value: Any) =
     SparkEnv.get.hadoopJobMetadata.put(key, value)
+
+  def addLocalConfiguration(jobTrackerId: String, jobId: Int, splitId: Int, attemptId: Int,
--- End diff --

Maybe add a comment like `/** Add Hadoop configuration specific to a single partition and attempt. */`
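For context, a method like the one quoted above typically populates the per-task Hadoop properties that legacy (Hadoop 1.x) InputFormats read. The sketch below follows aarondav's comment suggestion; the exact set of "mapred.*" keys and the map-task assumption are this sketch's assumptions, not necessarily what the PR ships:

```scala
import org.apache.hadoop.mapred.{JobConf, JobID, TaskAttemptID, TaskID}

object HadoopTaskPropsSketch {
  /** Add Hadoop configuration specific to a single partition and attempt. */
  def addLocalConfiguration(jobTrackerId: String, jobId: Int, splitId: Int,
      attemptId: Int, conf: JobConf) {
    val jobID = new JobID(jobTrackerId, jobId)
    // HadoopRDD partitions are read by map-side tasks, hence isMap = true.
    val taId = new TaskAttemptID(new TaskID(jobID, true, splitId), attemptId)

    conf.set("mapred.tip.id", taId.getTaskID.toString)
    conf.set("mapred.task.id", taId.toString)
    conf.setBoolean("mapred.task.is.map", true)
    conf.setInt("mapred.task.partition", splitId)
    conf.set("mapred.job.id", jobID.toString)
  }
}
```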
[GitHub] spark pull request: SPARK-782. Shade ASM
Github user pwendell commented on the pull request: https://github.com/apache/spark/pull/90#issuecomment-37563984 Do you mind closing this?
[GitHub] spark pull request: SPARK-782. Shade ASM
Github user sryza closed the pull request at: https://github.com/apache/spark/pull/90
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user sryza commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37564441 Updated patch with feedback
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10577891

--- Diff: core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala ---
@@ -17,28 +17,24 @@

 package org.apache.spark.scheduler

+import scala.collection.mutable.HashMap
+
 import java.io._
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.HashMap
-
 import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDDCheckpointData
+import org.apache.spark.rdd.{RDD, RDDCheckpointData}
 import org.apache.spark.storage._
-import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
+import org.apache.spark.util.BoundedHashMap

 private[spark] object ShuffleMapTask {

   // A simple map between the stage id to the serialized byte array of a task.
   // Served as a cache for task serialization because serialization can be
   // expensive on the master node if it needs to launch thousands of tasks.
-  val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]]
-
-  // TODO: This object shouldn't have global variables
-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.SHUFFLE_MAP_TASK, serializedInfoCache.clearOldValues, new SparkConf)
+  val MAX_CACHE_SIZE = 100
+  val serializedInfoCache = new BoundedHashMap[Int, Array[Byte]](MAX_CACHE_SIZE, true)
--- End diff --

Okay I looked into this more. I believe this can just be deleted when the rest of the stage-related data structures get deleted in the DAGScheduler. @markhamstra has done a good job of making sure that the state gets cleaned up there at the right time. Then we don't need to have any of the TimeStamped or Bounded stuff here.
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10578300

--- Diff: core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala ---
@@ -17,28 +17,24 @@

 package org.apache.spark.scheduler

+import scala.collection.mutable.HashMap
+
 import java.io._
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.HashMap
-
 import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDDCheckpointData
+import org.apache.spark.rdd.{RDD, RDDCheckpointData}
 import org.apache.spark.storage._
-import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
+import org.apache.spark.util.BoundedHashMap

 private[spark] object ShuffleMapTask {

   // A simple map between the stage id to the serialized byte array of a task.
   // Served as a cache for task serialization because serialization can be
   // expensive on the master node if it needs to launch thousands of tasks.
-  val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]]
-
-  // TODO: This object shouldn't have global variables
-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.SHUFFLE_MAP_TASK, serializedInfoCache.clearOldValues, new SparkConf)
+  val MAX_CACHE_SIZE = 100
+  val serializedInfoCache = new BoundedHashMap[Int, Array[Byte]](MAX_CACHE_SIZE, true)
--- End diff --

@pwendell Wait, actually isn't that right? Correct me if I'm wrong, but once a ShuffleDependency or an RDD goes out of scope, all associated stages can be safely removed, since you need this information to run tasks in the Stage. This suggests that if we keep track of a list of stage IDs in both ShuffleDependency and RDD, we can clean up this map (and the one in ResultTask) explicitly. @tdas, it looks like this map only concerns the master, so I don't think we need to notify the workers of stage completion.
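For context, the kind of size-bounded LRU map the new code above relies on can be sketched in a few lines. This is an illustrative stand-in; the PR's actual BoundedHashMap may well be implemented differently:

```scala
import java.util.Map.{Entry => JEntry}
import java.util.{LinkedHashMap => JLinkedHashMap}

// Minimal sketch of a bounded map: java.util.LinkedHashMap with accessOrder
// set evicts the least-recently-used entry once the map exceeds maxSize.
class BoundedHashMapSketch[K, V](maxSize: Int, useLRU: Boolean)
  extends JLinkedHashMap[K, V](16, 0.75f, useLRU) {

  // Invoked by LinkedHashMap after every put; returning true drops the
  // eldest (least-recently-used, when useLRU is set) entry.
  override protected def removeEldestEntry(eldest: JEntry[K, V]): Boolean =
    size() > maxSize
}
```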
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user markhamstra commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10578363

--- Diff: core/src/main/scala/org/apache/spark/ContextCleaner.scala ---
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
+
+import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}
+
+/** Listener class used for testing when any item has been cleaned by the Cleaner class */
+private[spark] trait CleanerListener {
+  def rddCleaned(rddId: Int)
+  def shuffleCleaned(shuffleId: Int)
+}
+
+/**
+ * Cleans RDDs and shuffle data.
+ */
+private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
+
+  /** Classes to represent cleaning tasks */
+  private sealed trait CleaningTask
+  private case class CleanRDD(rddId: Int) extends CleaningTask
+  private case class CleanShuffle(shuffleId: Int) extends CleaningTask
+  // TODO: add CleanBroadcast
+
+  private val queue = new LinkedBlockingQueue[CleaningTask]
+
+  protected val listeners = new ArrayBuffer[CleanerListener]
+    with SynchronizedBuffer[CleanerListener]
+
+  private val cleaningThread = new Thread() { override def run() { keepCleaning() }}
+
+  @volatile private var stopped = false
+
+  /** Start the cleaner */
+  def start() {
+    cleaningThread.setDaemon(true)
+    cleaningThread.start()
+  }
+
+  /** Stop the cleaner */
+  def stop() {
+    stopped = true
+    cleaningThread.interrupt()
+  }
+
+  /**
+   * Clean (unpersist) RDD data. Do not perform any time or resource intensive
+   * computation in this function as this is called from a finalize() function.
+   */
+  def cleanRDD(rddId: Int) {
+    enqueue(CleanRDD(rddId))
+    logDebug("Enqueued RDD " + rddId + " for cleaning up")
+  }
+
+  /**
+   * Clean shuffle data. Do not perform any time or resource intensive
+   * computation in this function as this is called from a finalize() function.
+   */
+  def cleanShuffle(shuffleId: Int) {
+    enqueue(CleanShuffle(shuffleId))
+    logDebug("Enqueued shuffle " + shuffleId + " for cleaning up")
+  }
+
+  /** Attach a listener object to get information of when objects are cleaned. */
+  def attachListener(listener: CleanerListener) {
+    listeners += listener
+  }
+
+  /**
+   * Enqueue a cleaning task. Do not perform any time or resource intensive
+   * computation in this function as this is called from a finalize() function.
+   */
+  private def enqueue(task: CleaningTask) {
+    queue.put(task)
+  }
+
+  /** Keep cleaning RDDs and shuffle data */
+  private def keepCleaning() {
+    try {
+      while (!isStopped) {
+        val taskOpt = Option(queue.poll(100, TimeUnit.MILLISECONDS))
+        taskOpt.foreach(task => {
+          logDebug("Got cleaning task " + taskOpt.get)
+          task match {
+            case CleanRDD(rddId) => doCleanRDD(sc, rddId)
+            case CleanShuffle(shuffleId) => doCleanShuffle(shuffleId)
+          }
+        })
+      }
+    } catch {
+      case ie: InterruptedException =>
+        if (!isStopped) logWarning("Cleaning thread interrupted")
+    }
+  }
+
+  /** Perform RDD cleaning */
+  private def doCleanRDD(sc: SparkContext, rddId: Int) {
+    logDebug("Cleaning rdd " + rddId)
+    blockManagerMaster.removeRdd(rddId, false)
--- End diff --

``blockManagerMaster.removeRdd(rddId, blocking = false)``
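The repeated "called from a finalize() function" warnings above matter because cleanRDD and cleanShuffle are meant to run on the JVM's finalizer thread, which must never block. A minimal sketch of the intended wiring, assuming a handle class that enqueues itself on garbage collection — the finalize hook itself is an assumption about how this cleaner would be driven, not code from the patch:

```scala
// Hypothetical sketch: an RDD-like handle that enqueues itself for cleanup
// when garbage collected. `cleaner` and `id` mirror names in the PR; the
// finalize() wiring is assumed, not part of the diff.
class CleanableRDDHandle(id: Int, cleaner: ContextCleaner) {
  override protected def finalize(): Unit = {
    try {
      // O(1): just a LinkedBlockingQueue.put. The actual cleanup happens
      // later on the cleaner's daemon thread, so the finalizer stays fast.
      cleaner.cleanRDD(id)
    } finally {
      super.finalize()
    }
  }
}
```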
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37570650

Merged build finished.
[GitHub] spark pull request: SPARK-1128: set hadoop task properties when co...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/101#issuecomment-37570651

All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13162/
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37570639

All automated tests passed. Refer to this link for build results: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/13163/
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37570638

Merged build finished.
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10579167

--- Diff: core/src/main/scala/org/apache/spark/MapOutputTracker.scala ---
@@ -181,15 +178,50 @@ private[spark] class MapOutputTracker(conf: SparkConf) extends Logging {
   }
 }

+/**
+ * MapOutputTracker for the workers. This uses BoundedHashMap to keep track of
+ * a limited number of most recently used map output information.
+ */
+private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) {
+
+  /**
+   * Bounded HashMap for storing serialized statuses in the worker. This allows
+   * the HashMap to stay bounded in memory usage. Things dropped from this HashMap will be
+   * automatically repopulated by fetching them again from the driver. It's okay to
+   * keep the cache size small, as it is unlikely that there will be a very large number of
+   * stages active simultaneously in the worker.
+   */
+  protected val mapStatuses = new BoundedHashMap[Int, Array[MapStatus]](
--- End diff --

Right, what TD is saying is that this particular map in MOTWorker is not concerned with stage IDs (but rather with shuffle IDs). In other words, the driver doesn't need to communicate stage information to the executors, since the executors do not maintain any maps keyed by stage ID, AFAIK.
[GitHub] spark pull request: [SPARK-1186] : Enrich the Spark Shell to suppo...
Github user berngp commented on a diff in the pull request: https://github.com/apache/spark/pull/116#discussion_r10579188

--- Diff: bin/spark-shell ---
@@ -30,69 +30,367 @@ esac

 # Enter posix mode for bash
 set -o posix

-CORE_PATTERN="^[0-9]+$"
-MEM_PATTERN="^[0-9]+[m|g|M|G]$"
-
+## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"

-if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
-  echo "Usage: spark-shell [OPTIONS]"
-  echo "OPTIONS:"
-  echo "-c --cores num, the maximum number of cores to be used by the spark shell"
-  echo "-em --execmem num[m|g], the memory used by each executor of spark shell"
-  echo "-dm --drivermem num[m|g], the memory used by the spark shell and driver"
-  echo "-h --help, print this help information"
-  exit
-fi
+VERBOSE=0
+DRY_RUN=0
+SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-}"
+MASTER=""
+
+#CLI Color Templates
+txtund=$(tput sgr 0 1)          # Underline
+txtbld=$(tput bold)             # Bold
+bldred=${txtbld}$(tput setaf 1) # red
+bldyel=${txtbld}$(tput setaf 3) # yellow
+bldblu=${txtbld}$(tput setaf 4) # blue
+bldwht=${txtbld}$(tput setaf 7) # white
+txtrst=$(tput sgr0)             # Reset
+info="${bldwht}*${txtrst}"      # Feedback
+pass="${bldblu}*${txtrst}"
+warn="${bldred}*${txtrst}"
+ques="${bldblu}?${txtrst}"
+
+# Helper function to describe the script usage
+function usage() {
+    cat << EOF
+
+${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
+
+${txtbld}OPTIONS${txtrst}:
+
+${txtund}basic${txtrst}:
+
+    -h  --help              : print this help information.
+    -c  --executor-cores    : the maximum number of cores to be used by the spark shell.
+    -em --executor-memory   : num[m|g], the memory used by each executor of spark shell.
+    -dm --drivermem --driver-memory : num[m|g], the memory used by the spark shell and driver.
+
+${txtund}soon to be deprecated${txtrst}:
+
+    --cores                 : please use -c/--executor-cores
+
+${txtund}other options${txtrst}:
+
+    -mip --master-ip        : Spark Master IP/Host Address
+    -mp  --master-port      : num, Spark Master Port
--- End diff --

How about:

    -em --executor-memory : the memory used by each executor of the spark shell, followed by m for megabytes or g for gigabytes, e.g. 1g.
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/135#issuecomment-37570921

Merged build started.
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37570930

Merged build started.
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37570929

Merged build triggered.
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/135#discussion_r10579726

--- Diff: core/src/main/scala/org/apache/spark/rdd/RDD.scala ---
@@ -310,6 +310,9 @@ abstract class RDD[T: ClassTag](
    * Return a sampled subset of this RDD.
    */
   def sample(withReplacement: Boolean, fraction: Double, seed: Int): RDD[T] = {
+    if (fraction < Double.MinValue || fraction > Double.MaxValue) {
--- End diff --

Use require, i.e.
```scala
require(fraction >= Double.MinValue && fraction <= Double.MaxValue, ...)
```
Shouldn't you just check for fraction > 0 but < 1?
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user CodingCat commented on a diff in the pull request: https://github.com/apache/spark/pull/135#discussion_r10580163

--- Diff: core/src/main/scala/org/apache/spark/rdd/RDD.scala ---
@@ -310,6 +310,9 @@ abstract class RDD[T: ClassTag](
    * Return a sampled subset of this RDD.
    */
   def sample(withReplacement: Boolean, fraction: Double, seed: Int): RDD[T] = {
+    if (fraction < Double.MinValue || fraction > Double.MaxValue) {
--- End diff --

Hi @rxin, I'm also a bit confused here; I think the name of the argument is a bit confusing.

https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/RDD.scala#L357

The above line contains a multiplier to ensure that the sampling returns enough sample points in most cases (I think so), so the fraction value can actually be larger than 1. This value actually determines the mean of the Poisson/Bernoulli distribution:

https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/RDD.scala#L314
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/135#discussion_r10580151

--- Diff: core/src/main/scala/org/apache/spark/rdd/RDD.scala ---
@@ -310,6 +310,9 @@ abstract class RDD[T: ClassTag](
    * Return a sampled subset of this RDD.
    */
   def sample(withReplacement: Boolean, fraction: Double, seed: Int): RDD[T] = {
+    if (fraction < Double.MinValue || fraction > Double.MaxValue) {
--- End diff --

The lower bound should be >= 0.0. Sampling with replacement can have a fraction greater than 1.0.
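Putting rxin's and mengxr's points together, the check would validate only the lower bound unconditionally, since with-replacement (Poisson) sampling legitimately takes fractions above 1.0, while without-replacement (Bernoulli) sampling treats fraction as a probability. A sketch, assuming a hypothetical helper rather than code from this PR:

```scala
// Sketch of the validation combining both review comments above.
// `checkFraction` is a hypothetical helper, not part of the patch.
def checkFraction(withReplacement: Boolean, fraction: Double): Unit = {
  // Both samplers need a non-negative mean/probability.
  require(fraction >= 0.0, "Invalid fraction value: " + fraction)
  if (!withReplacement) {
    // Bernoulli sampling: fraction is a probability, so it cannot exceed 1.
    require(fraction <= 1.0, "Fraction must be at most 1 without replacement: " + fraction)
  }
}
```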
[GitHub] spark pull request: [MLLIB-18] [WIP] Adding sparse data support an...
Github user dlwh commented on the pull request: https://github.com/apache/spark/pull/117#issuecomment-37572686

I'll release something this weekend.
[GitHub] spark pull request: [SPARK-1132] Persisting Web UI through refacto...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/42#discussion_r10580179

--- Diff: core/src/main/scala/org/apache/spark/scheduler/SparkReplayerBus.scala ---
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler
+
+import java.io.InputStream
+
+import scala.io.Source
+
+import it.unimi.dsi.fastutil.io.FastBufferedInputStream
+import org.apache.hadoop.fs.{Path, FileSystem}
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.io.CompressionCodec
+import org.apache.spark.util.{JsonProtocol, Utils}
+
+/**
+ * An EventBus that replays logged events from persisted storage
+ */
+private[spark] class SparkReplayerBus(conf: SparkConf) extends EventBus with Logging {
+  private val compressed = conf.getBoolean("spark.eventLog.compress", false)
+
+  // Only used if compression is enabled
+  private lazy val compressionCodec = CompressionCodec.createCodec(conf)
+
+  /**
+   * Return a list of paths representing log files in the given directory.
+   */
+  private def getLogFilePaths(logDir: String, fileSystem: FileSystem): Array[Path] = {
+    val path = new Path(logDir)
+    if (!fileSystem.exists(path) || !fileSystem.getFileStatus(path).isDir) {
+      logWarning("Log path provided is not a valid directory: %s".format(logDir))
+      return Array[Path]()
+    }
+    val logStatus = fileSystem.listStatus(path)
+    if (logStatus == null || !logStatus.exists(!_.isDir)) {
+      logWarning("Log path provided contains no log files: %s".format(logDir))
+      return Array[Path]()
+    }
+    logStatus.filter(!_.isDir).map(_.getPath).sortBy(_.getName)
+  }
+
+  /**
+   * Replay each event in the order maintained in the given logs.
+   */
+  def replay(logDir: String): Boolean = {
+    val fileSystem = Utils.getHadoopFileSystem(logDir)
+    val logPaths = getLogFilePaths(logDir, fileSystem)
+    if (logPaths.length == 0) {
+      return false
+    }
+
+    logPaths.foreach { path =>
+      // In case there is an exception, keep track of the highest level stream to close it later
+      var streamToClose: Option[InputStream] = None
+      var currentLine = ""
+      try {
+        val fstream = fileSystem.open(path)
+        val bstream = new FastBufferedInputStream(fstream)
+        val cstream = if (compressed) compressionCodec.compressedInputStream(bstream) else bstream
+        streamToClose = Some(cstream)
+
+        // Parse each line as an event and post it to all attached listeners
+        val lines = Source.fromInputStream(cstream).getLines()
+        lines.foreach { line =>
+          currentLine = line
+          val event = JsonProtocol.sparkEventFromJson(parse(line))
+          postToAll(event)
+        }
+      } catch {
+        case e: Exception =>
+          logWarning("Exception in parsing UI logs for %s".format(path))
--- End diff --

Oops. That was left over from before I moved it.
[GitHub] spark pull request: [SPARK-1241] Add sliding to RDD
GitHub user mengxr opened a pull request:

    https://github.com/apache/spark/pull/136

    [SPARK-1241] Add sliding to RDD

    Sliding is useful for operations like creating n-grams, calculating total variation, numerical integration, etc. This is similar to https://github.com/apache/incubator-spark/pull/18, but implemented differently.

    JIRA: https://spark-project.atlassian.net/browse/SPARK-1241

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/mengxr/spark sliding

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/spark/pull/136.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message:

    This closes #136

----
commit d2a600d5c0ab8a068cb23bdd422645d8b1a39f0b
Author: Xiangrui Meng <m...@databricks.com>
Date:   2014-03-13T08:47:45Z

    add sliding to rdd

commit 5ee6001471b1897400fef1e35b5e10fbfb47395f
Author: Xiangrui Meng <m...@databricks.com>
Date:   2014-03-13T18:49:04Z

    add TODO
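The intended semantics mirror Scala's `Iterator.sliding` with step 1, which makes the n-gram use case concrete. A local, collections-only analogue of the expected behavior (this illustrates what the proposed `RDD.sliding` should return, not how the PR implements it across partitions):

```scala
// Local stand-in for the proposed RDD.sliding, using plain Scala collections.
val windows = (1 to 5).iterator.sliding(2).map(_.toList).toList
// List(List(1, 2), List(2, 3), List(3, 4), List(4, 5))

// Bigram example: the kind of n-gram construction the PR description mentions.
val words = Seq("to", "be", "or", "not", "to", "be")
val bigrams = words.iterator.sliding(2).map(_.mkString(" ")).toList
// List("to be", "be or", "or not", "not to", "to be")
```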
[GitHub] spark pull request: [SPARK-1132] Persisting Web UI through refacto...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/42#discussion_r10580856

--- Diff: core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala ---
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler
+
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.io.CompressionCodec
+import org.apache.spark.util.{JsonProtocol, FileLogger}
+
+/**
+ * A SparkListener that logs events to persistent storage.
+ *
+ * Event logging is specified by the following configurable parameters:
+ *   spark.eventLog.enabled - Whether event logging is enabled.
+ *   spark.eventLog.compress - Whether to compress logged events
+ *   spark.eventLog.overwrite - Whether to overwrite any existing files.
+ *   spark.eventLog.dir - Path to the directory in which events are logged.
+ *   spark.eventLog.buffer.kb - Buffer size to use when writing to output streams
+ */
+private[spark] class EventLoggingListener(appName: String, conf: SparkConf)
+  extends SparkListener with Logging {
+
+  private val shouldCompress = conf.getBoolean("spark.eventLog.compress", false)
+  private val shouldOverwrite = conf.getBoolean("spark.eventLog.overwrite", true)
--- End diff --

Sounds good. (Though the chance of a file name collision is actually quite low, since we also attach a timestamp to it, which is why I didn't expose this config to the user.)
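For reference, a sketch of how an application would opt in to the logging parameters listed in the scaladoc above, using the config keys exactly as the patch defines them; the app name and directory path are illustrative:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Enables the EventLoggingListener described above. Any key left unset
// falls back to the defaults in the listener (e.g. no compression).
val conf = new SparkConf()
  .setAppName("event-log-demo")                    // illustrative app name
  .set("spark.eventLog.enabled", "true")
  .set("spark.eventLog.compress", "true")
  .set("spark.eventLog.dir", "/tmp/spark-events")  // illustrative path
val sc = new SparkContext(conf)
```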
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/135#discussion_r10580942

--- Diff: core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala ---
@@ -457,6 +457,10 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   test("takeSample") {
     val data = sc.parallelize(1 to 100, 2)
+    val emptySet = data.filter(_ => false)
--- End diff --

Is there a better way to create an empty RDD?
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10581534

--- Diff: core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala ---
@@ -17,28 +17,24 @@ package org.apache.spark.scheduler

+import scala.collection.mutable.HashMap
+
 import java.io._
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.HashMap
-
 import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDDCheckpointData
+import org.apache.spark.rdd.{RDD, RDDCheckpointData}
 import org.apache.spark.storage._
-import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
+import org.apache.spark.util.BoundedHashMap

 private[spark] object ShuffleMapTask {
   // A simple map between the stage id to the serialized byte array of a task.
   // Served as a cache for task serialization because serialization can be
   // expensive on the master node if it needs to launch thousands of tasks.
-  val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]]
-
-  // TODO: This object shouldn't have global variables
-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.SHUFFLE_MAP_TASK, serializedInfoCache.clearOldValues, new SparkConf)
+  val MAX_CACHE_SIZE = 100
+  val serializedInfoCache = new BoundedHashMap[Int, Array[Byte]](MAX_CACHE_SIZE, true)
--- End diff --

@andrewor14 What I'm suggesting is even simpler than that (for this specific data structure). When the stage finishes, you can just remove the associated entries in this hash map. Once a given stage finishes, we will never re-execute a stage with the same stageId, AFAIK. Maybe @markhamstra could confirm this. The thing you are suggesting is needed (I think), but not for this particular case, where I believe something simpler will suffice.
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10581648

--- Diff: core/src/main/scala/org/apache/spark/MapOutputTracker.scala ---
@@ -181,15 +178,50 @@ private[spark] class MapOutputTracker(conf: SparkConf) extends Logging {
   }
 }

+/**
+ * MapOutputTracker for the workers. This uses BoundedHashMap to keep track of
+ * a limited number of most recently used map output information.
+ */
+private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) {
+
+  /**
+   * Bounded HashMap for storing serialized statuses in the worker. This allows
+   * the HashMap to stay bounded in memory usage. Things dropped from this HashMap will be
+   * automatically repopulated by fetching them again from the driver. It's okay to
+   * keep the cache size small, as it is unlikely that there will be a very large number of
+   * stages active simultaneously in the worker.
+   */
+  protected val mapStatuses = new BoundedHashMap[Int, Array[MapStatus]](
--- End diff --

I see — so then isn't this even simpler? We just add a message to clean up the shuffleId once the shuffle dependency goes out of scope?
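A sketch of what that simpler path could look like inside the ContextCleaner from this PR: when a ShuffleDependency goes out of scope, a CleanShuffle task drops the shuffle's state on the driver, and the bounded caches on the workers age the entries out naturally. The unregisterShuffle/removeShuffle calls are assumptions about the tracker and block manager APIs, not code from the patch:

```scala
// Hypothetical completion of ContextCleaner.doCleanShuffle from this PR.
// Both driver-side method names below are assumed, not confirmed by the diff.
private def doCleanShuffle(shuffleId: Int) {
  logDebug("Cleaning shuffle " + shuffleId)
  mapOutputTracker.unregisterShuffle(shuffleId)   // drop map output statuses
  blockManagerMaster.removeShuffle(shuffleId)     // drop shuffle blocks
  listeners.foreach(_.shuffleCleaned(shuffleId))  // notify test listeners
}
```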
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/120#issuecomment-37577269

Merged build finished.
[GitHub] spark pull request: [SPARK-1241] Add sliding to RDD
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/136#issuecomment-37577527

Merged build triggered.
[GitHub] spark pull request: SPARK-1183. Don't use worker to mean executo...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/120
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/135#issuecomment-37577534

Merged build started.
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user pwendell commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10582800

--- Diff: core/src/main/scala/org/apache/spark/MapOutputTracker.scala ---
@@ -181,15 +178,50 @@ private[spark] class MapOutputTracker(conf: SparkConf) extends Logging {
   }
 }

+/**
+ * MapOutputTracker for the workers. This uses BoundedHashMap to keep track of
+ * a limited number of most recently used map output information.
+ */
+private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) {
+
+  /**
+   * Bounded HashMap for storing serialized statuses in the worker. This allows
+   * the HashMap to stay bounded in memory usage. Things dropped from this HashMap will be
+   * automatically repopulated by fetching them again from the driver. It's okay to
+   * keep the cache size small, as it is unlikely that there will be a very large number of
+   * stages active simultaneously in the worker.
+   */
+  protected val mapStatuses = new BoundedHashMap[Int, Array[MapStatus]](
--- End diff --

Andrew and I did some pair programming and I think we figured out how to remove all of the BoundedHashMaps: https://github.com/pwendell/spark/commit/dc42db62426fddc8cbe961d9c2b3af1bf1ad14c5
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/135#discussion_r10583451

--- Diff: core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala ---
@@ -457,6 +457,10 @@ class RDDSuite extends FunSuite with SharedSparkContext {
   test("takeSample") {
     val data = sc.parallelize(1 to 100, 2)
+    val emptySet = data.mapPartitions { iter => Iterator.empty }
--- End diff --

Let us create a separate test "takeSample from an empty rdd" and construct an empty rdd directly:

```scala
val emptyRdd = sc.parallelize(Seq.empty[Int], 2)
```
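Spelled out, the separate test mengxr describes might look like the following; the assertions are an assumption about the desired behavior (an empty sample rather than an exception or a hang):

```scala
// Sketch of the suggested test, in the RDDSuite style quoted above.
test("takeSample from an empty rdd") {
  val emptyRdd = sc.parallelize(Seq.empty[Int], 2)
  // Sampling from an empty RDD should return no elements, with or
  // without replacement, rather than looping or throwing.
  assert(emptyRdd.takeSample(withReplacement = false, num = 1, seed = 1).length === 0)
  assert(emptyRdd.takeSample(withReplacement = true, num = 1, seed = 1).length === 0)
}
```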
[GitHub] spark pull request: [SPARK-1132] Persisting Web UI through refacto...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/42#discussion_r10584092

--- Diff: core/src/main/scala/org/apache/spark/scheduler/SparkReplayerBus.scala ---
+/**
+ * An EventBus that replays logged events from persisted storage
+ */
+private[spark] class SparkReplayerBus(conf: SparkConf) extends EventBus with Logging {
--- End diff --

I think something you can attach listeners to pretty much means pub-sub (the difference is how often you publish; in replay mode we publish once in the beginning). I propose the following:

    trait SparkListenerBus
    ReplayListenerBus extends SparkListenerBus
    LiveListenerBus / PollingListenerBus / AsynchronousListenerBus extends SparkListenerBus
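A skeleton of that hierarchy, to make the shared pub-sub surface explicit. The class names follow the comment above; the simplified event and listener types (a type parameter and plain functions) are assumptions standing in for Spark's real SparkListener types:

```scala
import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}

// Shared pub-sub core: anything you can attach listeners to.
trait SparkListenerBus[E] {
  protected val listeners = new ArrayBuffer[E => Unit] with SynchronizedBuffer[E => Unit]

  def addListener(listener: E => Unit) { listeners += listener }

  /** Deliver one event to every attached listener. */
  protected def postToAll(event: E) { listeners.foreach(_(event)) }
}

// Publishes once, up front, by replaying a persisted event log.
class ReplayListenerBus[E](log: Seq[E]) extends SparkListenerBus[E] {
  def replay() { log.foreach(postToAll) }
}

// Publishes continuously as events arrive from a live application.
class LiveListenerBus[E] extends SparkListenerBus[E] {
  def post(event: E) { postToAll(event) }
}
```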
[GitHub] spark pull request: [SPARK-1241] Add sliding to RDD
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/136#issuecomment-37583751

Merged build finished.
[GitHub] spark pull request: Bundle tachyon
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/137#issuecomment-37584054

Can one of the admins verify this patch?
[GitHub] spark pull request: Bundle tachyon
GitHub user nicklan opened a pull request:

    https://github.com/apache/spark/pull/137

    Bundle tachyon

    This should all work as expected with the current version of the tachyon tarball (0.4.1)

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/nicklan/spark bundle-tachyon

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/spark/pull/137.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message:

    This closes #137

----
commit 5fef6dd3fe67867317fec9a85ad1ef3432250fa4
Author: Nick Lanham <n...@afternight.org>
Date:   2014-02-22T01:26:55Z

    Add scripts using tachyon tarball - currently won't work as we're waiting on the new bin/tachyon script from tachyon

commit 819cb8c95bf7a089843d6634cc0f8769504dd4ff
Author: Nick Lanham <n...@afternight.org>
Date:   2014-02-28T21:35:47Z

    Update to tachyon 0.4.1 - This should now work (tested on ec2)

commit 11f1ae66e9259cd2b86080305e8c5a9fe200c584
Author: Nick Lanham <n...@afternight.org>
Date:   2014-02-28T22:50:12Z

    Copy over web resources so web interface can run

commit a088e0481f7d64368f2ce3e832913ef6ac836846
Author: Nick Lanham <n...@afternight.org>
Date:   2014-03-13T20:35:45Z

    Only try tachyon operations if tachyon script exists
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/135#issuecomment-37584222

Merged build triggered.
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user andrewor14 commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10585234

--- Diff: core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala ---
@@ -17,28 +17,24 @@ package org.apache.spark.scheduler

+import scala.collection.mutable.HashMap
+
 import java.io._
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.HashMap
-
 import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDDCheckpointData
+import org.apache.spark.rdd.{RDD, RDDCheckpointData}
 import org.apache.spark.storage._
-import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
+import org.apache.spark.util.BoundedHashMap

 private[spark] object ShuffleMapTask {
   // A simple map between the stage id to the serialized byte array of a task.
   // Served as a cache for task serialization because serialization can be
   // expensive on the master node if it needs to launch thousands of tasks.
-  val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]]
-
-  // TODO: This object shouldn't have global variables
-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.SHUFFLE_MAP_TASK, serializedInfoCache.clearOldValues, new SparkConf)
+  val MAX_CACHE_SIZE = 100
+  val serializedInfoCache = new BoundedHashMap[Int, Array[Byte]](MAX_CACHE_SIZE, true)
--- End diff --

Right, if we just directly remove the stage as soon as it finishes, then we don't need to keep track of the stages associated with ShuffleDependencies and RDDs. This is because by the time the dependency or RDD goes out of scope, the stage would have already been removed. This gets rid of the BoundedHashMap in `ShuffleMapTask` and `ResultTask`.
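A sketch of the simpler scheme being converged on here: evict a stage's serialized-task cache entries the moment that stage completes, so no size bound or reference tracking is needed. The `removeStage` helper and the DAGScheduler call site are assumptions illustrating the idea, not existing API:

```scala
import scala.collection.mutable.HashMap

// Hypothetical companion-object helper (mirrored in ResultTask): drop the
// cached serialized task bytes for a stage as soon as that stage finishes.
object ShuffleMapTask {
  private val serializedInfoCache = new HashMap[Int, Array[Byte]]

  def removeStage(stageId: Int): Unit = synchronized {
    serializedInfoCache.remove(stageId)
  }
}

// Assumed call site in DAGScheduler when a stage completes:
//   ShuffleMapTask.removeStage(stage.id)
//   ResultTask.removeStage(stage.id)
```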
[GitHub] spark pull request: [SPARK-1103] [WIP] Automatic garbage collectio...
Github user markhamstra commented on a diff in the pull request: https://github.com/apache/spark/pull/126#discussion_r10586047

--- Diff: core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala ---
@@ -17,28 +17,24 @@ package org.apache.spark.scheduler

+import scala.collection.mutable.HashMap
+
 import java.io._
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.HashMap
-
 import org.apache.spark._
 import org.apache.spark.executor.ShuffleWriteMetrics
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDDCheckpointData
+import org.apache.spark.rdd.{RDD, RDDCheckpointData}
 import org.apache.spark.storage._
-import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
+import org.apache.spark.util.BoundedHashMap

 private[spark] object ShuffleMapTask {
   // A simple map between the stage id to the serialized byte array of a task.
   // Served as a cache for task serialization because serialization can be
   // expensive on the master node if it needs to launch thousands of tasks.
-  val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]]
-
-  // TODO: This object shouldn't have global variables
-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.SHUFFLE_MAP_TASK, serializedInfoCache.clearOldValues, new SparkConf)
+  val MAX_CACHE_SIZE = 100
+  val serializedInfoCache = new BoundedHashMap[Int, Array[Byte]](MAX_CACHE_SIZE, true)
--- End diff --

> This is because by the time the dependency or RDD goes out of scope, the stage will already have been removed.

Right, but do be aware that it doesn't work the other way around. A stage and stageId can be created and associated with a ShuffleDependency when a job runs, and that stage and stageId can then disappear from the DAGScheduler when the job completes (finished, canceled, or aborted); but metadata, cached data, etc. for the associated ShuffleDependency should stick around as long as that ShuffleDependency is in scope, since DAGScheduler#newOrUsedStage will want to make use of prior mapOutputs (now associated with a fresh stageId) when it can, instead of forcing re-evaluation of those results. Just because one job and stage is done with a shuffleDep doesn't mean that another job will not want to make use of it, as long as that shuffleDep is in scope from some RDD.
[GitHub] spark pull request: SPARK-1240: handle the case of empty RDD when ...
Github user mengxr commented on the pull request: https://github.com/apache/spark/pull/135#issuecomment-37587059

LGTM. Waiting for Jenkins.