The documentation says this setting is used to disable the Akka transport failure detector. Why is the magic number 6000s used, then? To disable the heartbeat it should be the maximum possible value, not 6000s.
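For anyone who just wants to push the thresholds out of the way, here is a minimal sketch (untested; the app name is hypothetical) of raising both values far beyond any realistic GC pause, using the overrides that worked later in this thread:

  import org.apache.spark.{SparkConf, SparkContext}

  // Raise the Akka failure-detector thresholds well past any expected pause.
  // 60000s / 10000s are the values reported below; any sufficiently large
  // values should behave the same.
  val conf = new SparkConf()
    .setAppName("long-running-job") // hypothetical app name
    .set("spark.akka.heartbeat.pauses", "60000s")   // acceptable pause before a peer is considered failed
    .set("spark.akka.heartbeat.interval", "10000s") // how often heartbeats are sent
  val sc = new SparkContext(conf)

The same two keys can also be passed as --conf flags to spark-submit or put in spark-defaults.conf.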
Using magic numbers like 1 hour 40 min creates issues that are difficult to debug. Most probably all Spark integration tests finish in less than 6000s, so this area is essentially untested. Anyone know why spark.akka.heartbeat.pauses=6000s ???

On Sun, Dec 20, 2015 at 9:40 PM, Josh Rosen <joshro...@databricks.com> wrote:

> Would you mind copying this information into a JIRA ticket to make it
> easier to discover / track? Thanks!
>
> On Sun, Dec 20, 2015 at 11:35 AM Alexander Pivovarov <apivova...@gmail.com> wrote:
>
>> Usually a Spark EMR job fails with the following exception in 1 hour 40 min
>> - "Job cancelled because SparkContext was shut down":
>>
>> java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6294]
>> at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048)
>> at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
>> at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
>> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
>> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
>> at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634)
>> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
>> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
>> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
>> at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)
>> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
>> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
>> at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)
>> at java.lang.Thread.run(Thread.java:745)
>>
>> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
>> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
>> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
>> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
>> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
>> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
>> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
>> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
>> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
>> at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.RDD.fold(RDD.scala:1057)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33)
>> at com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41)
>> at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35)
>> at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75)
>> at com.radius.distiller.Execute$.run(Execute.scala:55)
>> at com.radius.distiller.Execute$.main(Execute.scala:29)
>> at com.radius.distiller.Execute.main(Execute.scala)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:606)
>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>> Command exiting with ret '1'
>>
>> On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov <apivova...@gmail.com> wrote:
>>
>>> Or this message:
>>>
>>> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
>>> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
>>> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
>>> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
>>> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
>>> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
>>> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
>>> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
>>> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
>>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>>> at com.radius.distiller.Execute.main(Execute.scala)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>> at java.lang.reflect.Method.invoke(Method.java:606)
>>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>>
>>> On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov <apivova...@gmail.com> wrote:
>>>
>>>> It can also fail with the following message:
>>>>
>>>> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3
>>>> in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>>> at java.lang.Thread.run(Thread.java:745)
>>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>>> ... 1 more
>>>>
>>>> Driver stacktrace:
>>>> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
>>>> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>>>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>>>> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>>> at scala.Option.foreach(Option.scala:236)
>>>> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
>>>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>>>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>>>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>>>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>>>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>>>> at com.radius.distiller.Execute.main(Execute.scala)
>>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>>> at java.lang.reflect.Method.invoke(Method.java:606)
>>>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>>>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>>>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>>>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>>>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>>> Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>>> at java.lang.Thread.run(Thread.java:745)
>>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>>> ... 1 more
>>>>
>>>> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <apivova...@gmail.com> wrote:
>>>>
>>>>> I run Spark 1.5.2 on YARN (EMR).
>>>>>
>>>>> I noticed that my long-running jobs always failed after 1 hour 40 min (6000s) with the exceptions below.
>>>>>
>>>>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default.
>>>>>
>>>>> I changed the settings to the following and it solved my issue:
>>>>>
>>>>> "spark.akka.heartbeat.pauses": "60000s",
>>>>> "spark.akka.heartbeat.interval": "10000s"
>>>>>
>>>>> ERROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at java.util.Arrays.copyOf(Arrays.java:2271)
>>>>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>>>>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>>>>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>>>>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>>>>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>>>>> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
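To sanity-check that the overrides actually reached the driver, something like this should work (a sketch; assumes an active SparkContext on an Akka-based build, i.e. Spark 1.5.x or earlier, and that both keys were set explicitly, since SparkConf.get throws for unset keys):

  // Print the effective values; expect "60000s" and "10000s" with the workaround applied.
  println(sc.getConf.get("spark.akka.heartbeat.pauses"))
  println(sc.getConf.get("spark.akka.heartbeat.interval"))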