Hi, I have several Spark jobs, including both batch and streaming jobs, that process and analyze system logs. We are using Kafka as the pipeline connecting the jobs.
After upgrading to Spark 2.1.0 + Spark Kafka Streaming 010, I found that some of the jobs (both batch and streaming) randomly throw the exceptions below (either after several hours of running, or after running for just 20 minutes). Can anyone give me some suggestions on how to figure out the real root cause? (The Google results don't look very useful...) Thanks, Martin 00:30:04,510 WARN - 17/07/22 00:30:04 WARN TaskSetManager: Lost task 60.0 in stage 1518490.0 (TID 338070, 10.133.96.21, executor 0): java.io.FileNotFoundException: /mnt/mesos/work_dir/slaves/20160924-021501-274760970-5050-7646-S2/frameworks/40aeb8e5-e82a-4df9-b034-8815a7a7564b-2543/executors/0/runs/fd15c15d-2511-4f37-a106-27431f583153/blockmgr-a0e0e673-f88b-4d12-a802-c35643e6c6b2/33/shuffle_2090_60_0.index.b66235be-79be-4455-9759-1c7ba70f91f6 (No such file or directory) 00:30:04,510 WARN - at java.io.FileOutputStream.open0(Native Method) 00:30:04,510 WARN - at java.io.FileOutputStream.open(FileOutputStream.java:270) 00:30:04,510 WARN - at java.io.FileOutputStream.<init>(FileOutputStream.java:213) 00:30:04,510 WARN - at java.io.FileOutputStream.<init>(FileOutputStream.java:162) 00:30:04,510 WARN - at org.apache.spark.shuffle.IndexShuffleBlockResolver.writeIndexFileAndCommit(IndexShuffleBlockResolver.scala:144) 00:30:04,510 WARN - at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:128) 00:30:04,510 WARN - at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) 00:30:04,510 WARN - at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) 00:30:04,510 WARN - at org.apache.spark.scheduler.Task.run(Task.scala:99) 00:30:04,510 WARN - at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) 00:30:04,510 WARN - at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 00:30:04,510 WARN - at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 00:30:04,510 WARN - at 
java.lang.Thread.run(Thread.java:748) 00:30:04,580 INFO - Driver stacktrace: 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.org $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422) 00:30:04,580 INFO - scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) 00:30:04,580 INFO - scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) 00:30:04,580 INFO - scala.Option.foreach(Option.scala:257) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594) 00:30:04,580 INFO - org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628) 00:30:04,580 INFO - org.apache.spark.SparkContext.runJob(SparkContext.scala:1918) 00:30:04,580 INFO - org.apache.spark.SparkContext.runJob(SparkContext.scala:1931) 00:30:04,580 INFO - org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1353) 00:30:04,580 
INFO - org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) 00:30:04,580 INFO - org.apache.spark.rdd.RDD.withScope(RDD.scala:362) 00:30:04,580 INFO - org.apache.spark.rdd.RDD.take(RDD.scala:1326) 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply$mcZ$sp(RDD.scala:1461) 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply(RDD.scala:1461) 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply(RDD.scala:1461) 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) 00:30:04,580 INFO - org.apache.spark.rdd.RDD.withScope(RDD.scala:362) 00:30:04,580 INFO - org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1460) 00:30:04,580 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment.ClientLogsEnrichmentJob$.executeIteration(ClientLogsEnrichmentJob.scala:133) 00:30:04,580 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment.ClientLogsEnrichmentJob$.runIteration(ClientLogsEnrichmentJob.scala:76) 00:30:04,581 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment.ClientLogsEnrichmentJob$.runBatch(ClientLogsEnrichmentJob.scala:59) 00:30:04,581 INFO - com.ericsson.mediafirst.sparkutils.jobtemplates.BatchJob.main(BatchJob.scala:35) 00:30:04,581 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment.ClientLogsEnrichmentJob.main(ClientLogsEnrichmentJob.scala) 00:30:04,581 INFO - sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 00:30:04,581 INFO - sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 00:30:04,581 INFO - sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 00:30:04,581 INFO - java.lang.reflect.Method.invoke(Method.java:498) 00:30:04,581 INFO - 
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738) 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187) 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212) 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126) 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) 00:30:04,581 WARN - 17/07/22 00:30:04 WARN JobProgressListener: Task start for unknown stage 1518491 00:30:04,670 WARN - 17/07/22 00:30:04 ERROR LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0, 10.133.96.21, 45377, None),rdd_15721_0,StorageLevel(memory, deserialized, 1 replicas),12024,0)) 00:30:04,673 WARN - 17/07/22 00:30:04 ERROR LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0, 10.133.96.21, 45377, None),rdd_15721_1,StorageLevel(memory, deserialized, 1 replicas),13736,0)) 00:30:04,679 WARN - 17/07/22 00:30:04 ERROR TransportRequestHandler: Error while invoking RpcHandler#receive() for one-way message. 00:30:04,679 WARN - org.apache.spark.SparkException: Could not find CoarseGrainedScheduler. 
00:30:04,679 WARN - at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:154) 00:30:04,679 WARN - at org.apache.spark.rpc.netty.Dispatcher.postOneWayMessage(Dispatcher.scala:134) 00:30:04,679 WARN - at org.apache.spark.rpc.netty.NettyRpcHandler.receive(NettyRpcEnv.scala:570) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportRequestHandler.processOneWayMessage(TransportRequestHandler.java:180) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:109) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:119) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51) 00:30:04,679 WARN - at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:266) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) 00:30:04,679 WARN - at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911) 00:30:04,679 WARN - at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:652) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451) 00:30:04,679 WARN - at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140) 00:30:04,679 WARN - at 
io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144) 00:30:04,679 WARN - at java.lang.Thread.run(Thread.java:748) 00:30:04,679 WARN - 17/07/22 00:30:04 ERROR TransportRequestHandler: Error while invoking RpcHandler#receive() for one-way message. 00:30:04,679 WARN - org.apache.spark.SparkException: Could not find CoarseGrainedScheduler. 00:30:04,679 WARN - at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:154) 00:30:04,679 WARN - at org.apache.spark.rpc.netty.Dispatcher.postOneWayMessage(Dispatcher.scala:134) 00:30:04,679 WARN - at org.apache.spark.rpc.netty.NettyRpcHandler.receive(NettyRpcEnv.scala:570) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportRequestHandler.processOneWayMessage(TransportRequestHandler.java:180) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:109) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:119) 00:30:04,679 WARN - at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51) 00:30:04,679 WARN - at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:266) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 00:30:04,679 WARN - at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911) 00:30:04,679 WARN - at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:652) 00:30:04,679 WARN - at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489) 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451) 00:30:04,679 WARN - at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140) 00:30:04,679 WARN - at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144) 00:30:04,679 WARN - at java.lang.Thread.run(Thread.java:748) 00:30:11,318 WARN - I0722 00:30:11.318724 2921 sched.cpp:2021] Asked to stop the driver 00:30:11,318 WARN - I0722 00:30:11.318838 2988 sched.cpp:1203] Stopping framework 40aeb8e5-e82a-4df9-b034-8815a7a7564b-2543