Hi, I am able to run the Triangle Count example with some smaller graphs but when I am using http://snap.stanford.edu/data/com-Friendster.html <http://snap.stanford.edu/data/com-Friendster.html> I am not able to get the job finished ok. For some reason Spark loses its executors. No matter what I use to configure spark (1.5) I just receive errors, the last configuration I’ve used was running for some time than it gave executors lost errors.
Some exceptions/errors I got: ERROR LiveListenerBus: Listener JobProgressListener threw an exception java.lang.NullPointerException at org.apache.spark.ui.jobs.JobProgressListener$$anonfun$onTaskEnd$1.apply(JobProgressListener.scala:362) at org.apache.spark.ui.jobs.JobProgressListener$$anonfun$onTaskEnd$1.apply(JobProgressListener.scala:361) at scala.collection.immutable.List.foreach(List.scala:318) at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:32) at scala.collection.mutable.ListBuffer.foreach(ListBuffer.scala:45) at org.apache.spark.ui.jobs.JobProgressListener.onTaskEnd(JobProgressListener.scala:361) at org.apache.spark.scheduler.SparkListenerBus$class.onPostEvent(SparkListenerBus.scala:42) at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31) at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31) at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:56) at org.apache.spark.util.AsynchronousListenerBus.postToAll(AsynchronousListenerBus.scala:37) at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:79) at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1136) at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:63) ERROR MapOutputTracker: Missing an output location for shuffle 1 ERROR TaskSchedulerImpl: Lost executor 29 on 172.16.96.49: worker lost ERROR TaskSchedulerImpl: Lost executor 24 on 172.16.96.39: worker lost 16/02/16 12:41:47 WARN HeartbeatReceiver: Removing executor 8 with no recent heartbeats: 168312 ms exceeds timeout 120000 ms 16/02/16 12:41:47 ERROR TaskSchedulerImpl: Lost executor 8 on 172.16.96.53: Executor heartbeat timed out after 168312 ms 16/02/16 12:41:47 ERROR TaskSchedulerImpl: Lost executor 9 on 172.16.96.9: Executor heartbeat timed out after 163671 ms 16/02/16 12:53:53 ERROR TaskSetManager: Task 9 in stage 6.2 failed 4 times; aborting job 16/02/16 12:54:42 ERROR ContextCleaner: Error cleaning broadcast 19 org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [120 seconds]. This timeout is controlled by spark.rpc.askTimeout at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcEnv.scala:214) at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcEnv.scala:229) at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcEnv.scala:225) at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:33) at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcEnv.scala:242) at org.apache.spark.storage.BlockManagerMaster.removeBroadcast(BlockManagerMaster.scala:136) at org.apache.spark.broadcast.TorrentBroadcast$.unpersist(TorrentBroadcast.scala:228) at org.apache.spark.broadcast.TorrentBroadcastFactory.unbroadcast(TorrentBroadcastFactory.scala:45) at org.apache.spark.broadcast.BroadcastManager.unbroadcast(BroadcastManager.scala:67) at org.apache.spark.ContextCleaner.doCleanupBroadcast(ContextCleaner.scala:214) at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1$$anonfun$apply$mcV$sp$2.apply(ContextCleaner.scala:170) at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1$$anonfun$apply$mcV$sp$2.apply(ContextCleaner.scala:161) at scala.Option.foreach(Option.scala:236) at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1.apply$mcV$sp(ContextCleaner.scala:161) at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1136) at org.apache.spark.ContextCleaner.org$apache$spark$ContextCleaner$$keepCleaning(ContextCleaner.scala:154) at org.apache.spark.ContextCleaner$$anon$3.run(ContextCleaner.scala:67) Caused by: java.util.concurrent.TimeoutException: Futures timed out after [120 seconds] at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219) at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223) at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107) at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53) at scala.concurrent.Await$.result(package.scala:107) at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcEnv.scala:241) ... 12 more 16/02/16 12:54:43 ERROR LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(24, 172.16.96.39, 34237),rdd_31_1253,StorageLevel(false, true, false, false, 1),4728689,0,0)) 16/02/16 12:54:43 WARN BlockManagerMaster: Failed to remove broadcast 19 with removeFromMaster = true - Ask timed out on [Actor[akka.tcp://sparkExecutor@172.16.96.49:50924/user/BlockManagerEndpoint1#-399414923]] after [120000 ms]. This timeout is controlled by spark.rpc.askTimeout org.apache.spark.rpc.RpcTimeoutException: Ask timed out on [Actor[akka.tcp://sparkExecutor@172.16.96.49:50924/user/BlockManagerEndpoint1#-399414923]] after [120000 ms]. This timeout is controlled by spark.rpc.askTimeout at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcEnv.scala:214) at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcEnv.scala:229) at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcEnv.scala:225) at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:33) at scala.util.Failure$$anonfun$recover$1.apply(Try.scala:185) at scala.util.Try$.apply(Try.scala:161) at scala.util.Failure.recover(Try.scala:185) at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at scala.concurrent.Promise$class.complete(Promise.scala:55) at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72) at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634) at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at scala.concurrent.Promise$class.complete(Promise.scala:55) at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334) at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117) at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691) at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467) at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419) at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423) at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375) at java.lang.Thread.run(Thread.java:745) Caused by: akka.pattern.AskTimeoutException: Ask timed out on [Actor[akka.tcp://sparkExecutor@172.16.96.49:50924/user/BlockManagerEndpoint1#-399414923]] after [120000 ms] ... 9 more Best, Ovidiu