Usually Spark EMR job fails with the following exception in 1 hour 40 min - Job cancelled because SparkContext was shut down
java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6294] at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048) at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821) at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372) at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at scala.concurrent.Promise$class.complete(Promise.scala:55) at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at scala.concurrent.Promise$class.complete(Promise.scala:55) at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72) at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634) at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at scala.concurrent.Promise$class.complete(Promise.scala:55) at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334) at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117) at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691) at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467) at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419) at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423) at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375) at java.lang.Thread.run(Thread.java:745) Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703) at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702) at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514) at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84) at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438) at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185) at org.apache.spark.SparkContext.stop(SparkContext.scala:1723) at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) at org.apache.spark.rdd.RDD.fold(RDD.scala:1057) at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34) at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34) at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33) at com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41) at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35) at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75) at com.radius.distiller.Execute$.run(Execute.scala:55) at com.radius.distiller.Execute$.main(Execute.scala:29) at com.radius.distiller.Execute.main(Execute.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) Command exiting with ret '1' On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov <apivova...@gmail.com> wrote: > Or this message > > Exception in thread "main" org.apache.spark.SparkException: Job cancelled > because SparkContext was shut down > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702) > at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) > at > org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514) > at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84) > at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438) > at > org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724) > at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185) > at org.apache.spark.SparkContext.stop(SparkContext.scala:1723) > at > org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at > org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at > org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) > at > org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at > org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896) > at > org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430) > at > org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) > at > org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409) > at > com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65) > at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49) > at com.radius.distiller.Execute$.run(Execute.scala:56) > at com.radius.distiller.Execute$.main(Execute.scala:33) > at com.radius.distiller.Execute.main(Execute.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) > at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) > > > On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov < > apivova...@gmail.com> wrote: > >> it can also fail with the following message >> >> Exception in thread "main" org.apache.spark.SparkException: Job aborted due >> to stage failure: Task 133 in stage 33.1 failed 4 times, most recent >> failure: Lost task 133.3 in stage 33.1 (TID 172737, >> ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to >> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >> at >> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193) >> at >> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156) >> at >> org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170) >> at >> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) >> at java.util.concurrent.FutureTask.run(FutureTask.java:262) >> at >> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) >> at >> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) >> at java.lang.Thread.run(Thread.java:745) >> Caused by: java.net.ConnectException: Connection refused: >> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) >> at >> sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744) >> at >> io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224) >> at >> io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) >> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) >> at >> io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) >> ... 1 more >> >> Driver stacktrace: >> at >> org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283) >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271) >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270) >> at >> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) >> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) >> at >> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270) >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) >> at scala.Option.foreach(Option.scala:236) >> at >> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697) >> at >> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496) >> at >> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458) >> at >> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447) >> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) >> at >> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409) >> at >> com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65) >> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49) >> at com.radius.distiller.Execute$.run(Execute.scala:56) >> at com.radius.distiller.Execute$.main(Execute.scala:33) >> at com.radius.distiller.Execute.main(Execute.scala) >> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) >> at >> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) >> at >> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) >> at java.lang.reflect.Method.invoke(Method.java:606) >> at >> org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) >> at >> org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) >> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) >> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) >> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) >> Caused by: java.io.IOException: Failed to connect to >> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >> at >> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193) >> at >> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156) >> at >> org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43) >> at >> org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170) >> at >> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) >> at java.util.concurrent.FutureTask.run(FutureTask.java:262) >> at >> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) >> at >> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) >> at java.lang.Thread.run(Thread.java:745) >> Caused by: java.net.ConnectException: Connection refused: >> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) >> at >> sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744) >> at >> io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224) >> at >> io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) >> at >> io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) >> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) >> at >> io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) >> ... 1 more >> >> >> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov < >> apivova...@gmail.com> wrote: >> >>> I run Spark 1.5.2 on YARN (EMR) >>> >>> I noticed that my long running jobs always failed after 1h 40 min >>> (6000s) with the exceptions below. >>> >>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default >>> >>> I changed the settings to the following and it solve my issue. >>> >>> "spark.akka.heartbeat.pauses": "60000s", >>> "spark.akka.heartbeat.interval": "10000s" >>> >>> >>> >>> RROR ErrorMonitor - Uncaught fatal error from thread >>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>> ActorSystem [sparkDriver] >>> java.lang.OutOfMemoryError: Java heap space >>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192) >>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204) >>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>> at >>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> at >>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>> ActorSystem [sparkDriver] >>> java.lang.OutOfMemoryError: Java heap space >>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192) >>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204) >>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>> at >>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> at >>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>> [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down >>> ActorSystem [sparkDriver] >>> java.lang.OutOfMemoryError: Java heap space >>> at java.util.Arrays.copyOf(Arrays.java:2271) >>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118) >>> at >>> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93) >>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153) >>> at >>> java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876) >>> at >>> java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785) >>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188) >>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347) >>> at >>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129) >>> at >>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129) >>> at >>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129) >>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129) >>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at >>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>> at >>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> at >>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>> ActorSystem [sparkDriver] >>> java.lang.OutOfMemoryError: Java heap space >>> at >>> com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) >>> at >>> akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138) >>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740) >>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>> at >>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> at >>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> >>> at >>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> >>> >> >> >