Hello, I have a random forest that works fine with 20 trees trained on 5e6 LabeledPoints with 300 features... but when I try to scale it up just a bit, to 60 or 100 trees and 10e6 training points, it consistently fails with ExecutorLostFailures due to "no recent heartbeats" against the 120s timeout. (more detail on the error below)
What I'm hoping for, in addition to understanding why it's failing, is a practical approach to making this work:

- change some Spark parameters? e.g. increase spark.executor.heartbeatInterval (see the launch-config sketch at the very bottom of this mail)
- a larger cluster?
- (last resort) smaller or fewer trees?

Any advice appreciated. Thank you!

// rf params (a sketch of how these feed the training call is at the bottom of this mail)
val numClasses = 2
val numTrees = 100
val featureSubsetStrategy = "auto"
val maxDepth = 21
val maxBins = 16
val subsamplingRate = 0.8
val minInstancesPerNode = 9
val maxMemoryInMB = 3000
val useNodeIdCache = true

// error details
16/05/12 15:55:38 WARN HeartbeatReceiver: Removing executor 4 with no recent heartbeats: 126112 ms exceeds timeout 120000 ms
16/05/12 15:55:38 ERROR YarnScheduler: Lost executor 4 on ip-10-101-xx-xx.us-west-2.compute.internal: Executor heartbeat timed out after 126112 ms
16/05/12 15:55:38 WARN TaskSetManager: Lost task 72.3 in stage 38.0 (TID 29394, ip-10-101-xx-xx.us-west-2.compute.internal): ExecutorLostFailure (executor 4 exited caused by one of the running tasks)
16/05/12 15:55:38 INFO YarnClientSchedulerBackend: Requesting to kill executor(s) 4
16/05/12 15:55:38 INFO YarnScheduler: Cancelling stage 38
16/05/12 15:55:38 INFO YarnScheduler: Stage 38 was cancelled
16/05/12 15:55:38 INFO DAGScheduler: ShuffleMapStage 38 (mapPartitions at DecisionTree.scala:604) failed in 3732.026 s
16/05/12 15:55:38 INFO DAGScheduler: Executor lost: 4 (epoch 18)
16/05/12 15:55:38 INFO DAGScheduler: Job 20 failed: collectAsMap at DecisionTree.scala:651, took 3739.423425 s
16/05/12 15:55:38 INFO BlockManagerMasterEndpoint: Trying to remove executor 4 from BlockManagerMaster.
16/05/12 15:55:38 INFO BlockManagerMasterEndpoint: Removing block manager BlockManagerId(4, ip-10-101-xx-xx.us-west-2.compute.internal, 41007)
16/05/12 15:55:38 INFO BlockManagerMaster: Removed 4 successfully in removeExecutor
16/05/12 15:55:38 INFO DAGScheduler: Host added was in lost list earlier: ip-10-101-xx-xx.us-west-2.compute.internal

org.apache.spark.SparkException: Job aborted due to stage failure: Task 72 in stage 38.0 failed 4 times, most recent failure: Lost task 72.3 in stage 38.0 (TID 29394, ip-10-101-xx-xx.us-west-2.compute.internal): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 126112 ms
Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
  at scala.Option.foreach(Option.scala:236)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
  at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:741)
  at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:740)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
  at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:740)
  at org.apache.spark.mllib.tree.DecisionTree$.findBestSplits(DecisionTree.scala:651)
  at org.apache.spark.mllib.tree.RandomForest.run(RandomForest.scala:233)
  at org.apache.spark.mllib.tree.RandomForest$.trainClassifier(RandomForest.scala:289)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:52)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:58)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:60)
  at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:62)
  at $iwC$$iwC$$iwC$$iwC.<init>(<console>:64)
  at $iwC$$iwC$$iwC.<init>(<console>:66)
  at $iwC$$iwC.<init>(<console>:68)
  at $iwC.<init>(<console>:70)
  at <init>(<console>:72)
  at .<init>(<console>:76)
  at .<clinit>(<console>)
  at .<init>(<console>:7)
  at .<clinit>(<console>)
  at $print(<console>)
  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:606)
  at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
  at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
  at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
  at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
  at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
  at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
  at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
  at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:875)
  at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
  at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
  at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
  at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
  at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
  at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
  at org.apache.spark.repl.Main$.main(Main.scala:31)
  at org.apache.spark.repl.Main.main(Main.scala)
  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:606)
  at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
  at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
  at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
  at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
  at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
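P.S. For completeness, here's roughly how the rf params above feed the training call. This is a sketch, not my exact code: trainingData, the Gini impurity, and the seed are illustrative stand-ins. I'm using the Strategy-based trainClassifier overload, since the simple overload doesn't expose maxMemoryInMB / subsamplingRate / useNodeIdCache / minInstancesPerNode.

    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.RandomForest
    import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
    import org.apache.spark.mllib.tree.impurity.Gini
    import org.apache.spark.rdd.RDD

    // my prepared training set (~10e6 rows x 300 features); placeholder here
    val trainingData: RDD[LabeledPoint] = ???

    // Strategy carries the knobs the simple trainClassifier overload doesn't expose
    val strategy = new Strategy(
      algo = Algo.Classification,
      impurity = Gini,                        // illustrative choice
      maxDepth = maxDepth,
      numClasses = numClasses,
      maxBins = maxBins,
      minInstancesPerNode = minInstancesPerNode,
      maxMemoryInMB = maxMemoryInMB,
      subsamplingRate = subsamplingRate,
      useNodeIdCache = useNodeIdCache)

    // seed is an illustrative value
    val model = RandomForest.trainClassifier(
      trainingData, strategy, numTrees, featureSubsetStrategy, seed = 12345)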
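And this is the kind of launch-time change I had in mind for the first bullet above. Again just a sketch under my assumptions: the 120000 ms in the error looks like the default spark.network.timeout, so I'm guessing that's the value to raise (keeping spark.executor.heartbeatInterval well below it), and both need to be set before the SparkContext starts. The specific values are guesses.

    import org.apache.spark.{SparkConf, SparkContext}

    // these must be set at launch, before the SparkContext exists;
    // from spark-shell the equivalent would be:
    //   spark-shell --conf spark.executor.heartbeatInterval=60s \
    //               --conf spark.network.timeout=600s
    val conf = new SparkConf()
      .set("spark.executor.heartbeatInterval", "60s") // default 10s; keep well under the timeout
      .set("spark.network.timeout", "600s")           // default 120s, i.e. the 120000 ms in the error
    val sc = new SparkContext(conf)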