First, Spark will handle task failures itself, so if the job ended normally,
this error can be ignored.
Second, when using BypassMergeSortShuffleWriter, Spark first writes the data
file and then writes an index file.
You can check for "Failed to delete temporary index file at" or "fail to rename
file" in the related executor node's log file.

2017-07-25 0:33 GMT+08:00 Martin Peng <wei...@gmail.com>:

> Could anyone shed some light on this issue?
>
> Thanks
> Martin
>
> 2017-07-21 18:58 GMT-07:00 Martin Peng <wei...@gmail.com>:
>
>> Hi,
>>
>> I have several Spark jobs, including both batch jobs and streaming jobs, to
>> process the system logs and analyze them. We are using Kafka as the pipeline
>> to connect the jobs.
>>
>> After upgrading to Spark 2.1.0 + Spark Kafka Streaming 010, I found that some
>> of the jobs (both batch and streaming) randomly throw the exceptions below
>> (either after several hours of running or after just 20 minutes). Can anyone
>> give me some suggestions about how to find the real root cause?
>> (Google results do not look very useful so far...)
>>
>> Thanks,
>> Martin
>>
>> 00:30:04,510 WARN  - 17/07/22 00:30:04 WARN TaskSetManager: Lost task
>> 60.0 in stage 1518490.0 (TID 338070, 10.133.96.21, executor 0):
>> java.io.FileNotFoundException: /mnt/mesos/work_dir/slaves/201
>> 60924-021501-274760970-5050-7646-S2/frameworks/40aeb8e5-
>> e82a-4df9-b034-8815a7a7564b-2543/executors/0/runs/
>> fd15c15d-2511-4f37-a106-27431f583153/blockmgr-a0e0e673-f88b-
>> 4d12-a802-c35643e6c6b2/33/shuffle_2090_60_0.index.
>> b66235be-79be-4455-9759-1c7ba70f91f6 (No such file or directory)
>> 00:30:04,510 WARN  -     at java.io.FileOutputStream.open0(Native Method)
>> 00:30:04,510 WARN  -     at java.io.FileOutputStream.open(
>> FileOutputStream.java:270)
>> 00:30:04,510 WARN  -     at java.io.FileOutputStream.<init
>> >(FileOutputStream.java:213)
>> 00:30:04,510 WARN  -     at java.io.FileOutputStream.<init
>> >(FileOutputStream.java:162)
>> 00:30:04,510 WARN  -     at org.apache.spark.shuffle.Index
>> ShuffleBlockResolver.writeIndexFileAndCommit(IndexShuffleBlo
>> ckResolver.scala:144)
>> 00:30:04,510 WARN  -     at org.apache.spark.shuffle.sort.
>> BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:128)
>> 00:30:04,510 WARN  -     at org.apache.spark.scheduler.Shu
>> ffleMapTask.runTask(ShuffleMapTask.scala:96)
>> 00:30:04,510 WARN  -     at org.apache.spark.scheduler.Shu
>> ffleMapTask.runTask(ShuffleMapTask.scala:53)
>> 00:30:04,510 WARN  -     at org.apache.spark.scheduler.Tas
>> k.run(Task.scala:99)
>> 00:30:04,510 WARN  -     at org.apache.spark.executor.Exec
>> utor$TaskRunner.run(Executor.scala:282)
>> 00:30:04,510 WARN  -     at java.util.concurrent.ThreadPoo
>> lExecutor.runWorker(ThreadPoolExecutor.java:1142)
>> 00:30:04,510 WARN  -     at java.util.concurrent.ThreadPoo
>> lExecutor$Worker.run(ThreadPoolExecutor.java:617)
>> 00:30:04,510 WARN  -     at java.lang.Thread.run(Thread.java:748)
>>
>> 00:30:04,580 INFO  - Driver stacktrace:
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAGScheduler.org
>> $apache$spark$scheduler$DAGScheduler$$failJobAn
>> dIndependentStages(DAGScheduler.scala:1435)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
>> 00:30:04,580 INFO  - scala.collection.mutable.Resiz
>> ableArray$class.foreach(ResizableArray.scala:59)
>> 00:30:04,580 INFO  - scala.collection.mutable.Array
>> Buffer.foreach(ArrayBuffer.scala:48)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler.abortStage(DAGScheduler.scala:1422)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
>> 00:30:04,580 INFO  - scala.Option.foreach(Option.scala:257)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler.handleTaskSetFailed(DAGScheduler.scala:802)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> SchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> SchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> SchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
>> 00:30:04,580 INFO  - org.apache.spark.util.EventLoo
>> p$$anon$1.run(EventLoop.scala:48)
>> 00:30:04,580 INFO  - org.apache.spark.scheduler.DAG
>> Scheduler.runJob(DAGScheduler.scala:628)
>> 00:30:04,580 INFO  - org.apache.spark.SparkContext.
>> runJob(SparkContext.scala:1918)
>> 00:30:04,580 INFO  - org.apache.spark.SparkContext.
>> runJob(SparkContext.scala:1931)
>> 00:30:04,580 INFO  - org.apache.spark.SparkContext.
>> runJob(SparkContext.scala:1944)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD$$anon
>> fun$take$1.apply(RDD.scala:1353)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDDOperat
>> ionScope$.withScope(RDDOperationScope.scala:151)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDDOperat
>> ionScope$.withScope(RDDOperationScope.scala:112)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD.take(RDD.scala:1326)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD$$anon
>> fun$isEmpty$1.apply$mcZ$sp(RDD.scala:1461)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD$$anon
>> fun$isEmpty$1.apply(RDD.scala:1461)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD$$anon
>> fun$isEmpty$1.apply(RDD.scala:1461)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDDOperat
>> ionScope$.withScope(RDDOperationScope.scala:151)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDDOperat
>> ionScope$.withScope(RDDOperationScope.scala:112)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
>> 00:30:04,580 INFO  - org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1460)
>> 00:30:04,580 INFO  - com.ericsson.mediafirst.spark.
>> clientlogsenrichment.ClientLogsEnrichmentJob$.executeIterati
>> on(ClientLogsEnrichmentJob.scala:133)
>> 00:30:04,580 INFO  - com.ericsson.mediafirst.spark.
>> clientlogsenrichment.ClientLogsEnrichmentJob$.runIteration(C
>> lientLogsEnrichmentJob.scala:76)
>> 00:30:04,581 INFO  - com.ericsson.mediafirst.spark.
>> clientlogsenrichment.ClientLogsEnrichmentJob$.runBatch(Clien
>> tLogsEnrichmentJob.scala:59)
>> 00:30:04,581 INFO  - com.ericsson.mediafirst.sparku
>> tils.jobtemplates.BatchJob.main(BatchJob.scala:35)
>> 00:30:04,581 INFO  - com.ericsson.mediafirst.spark.
>> clientlogsenrichment.ClientLogsEnrichmentJob.main(ClientLogs
>> EnrichmentJob.scala)
>> 00:30:04,581 INFO  - sun.reflect.NativeMethodAccessorImpl.invoke0(Native
>> Method)
>> 00:30:04,581 INFO  - sun.reflect.NativeMethodAccess
>> orImpl.invoke(NativeMethodAccessorImpl.java:62)
>> 00:30:04,581 INFO  - sun.reflect.DelegatingMethodAc
>> cessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> 00:30:04,581 INFO  - java.lang.reflect.Method.invoke(Method.java:498)
>> 00:30:04,581 INFO  - org.apache.spark.deploy.SparkS
>> ubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSub
>> mit.scala:738)
>> 00:30:04,581 INFO  - org.apache.spark.deploy.SparkS
>> ubmit$.doRunMain$1(SparkSubmit.scala:187)
>> 00:30:04,581 INFO  - org.apache.spark.deploy.SparkS
>> ubmit$.submit(SparkSubmit.scala:212)
>> 00:30:04,581 INFO  - org.apache.spark.deploy.SparkS
>> ubmit$.main(SparkSubmit.scala:126)
>> 00:30:04,581 INFO  - org.apache.spark.deploy.SparkS
>> ubmit.main(SparkSubmit.scala)
>> 00:30:04,581 WARN  - 17/07/22 00:30:04 WARN JobProgressListener: Task
>> start for unknown stage 1518491
>> 00:30:04,670 WARN  - 17/07/22 00:30:04 ERROR LiveListenerBus:
>> SparkListenerBus has already stopped! Dropping event
>> SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0,
>> 10.133.96.21, 45377, None),rdd_15721_0,StorageLevel(memory,
>> deserialized, 1 replicas),12024,0))
>> 00:30:04,673 WARN  - 17/07/22 00:30:04 ERROR LiveListenerBus:
>> SparkListenerBus has already stopped! Dropping event
>> SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0,
>> 10.133.96.21, 45377, None),rdd_15721_1,StorageLevel(memory,
>> deserialized, 1 replicas),13736,0))
>> 00:30:04,679 WARN  - 17/07/22 00:30:04 ERROR TransportRequestHandler:
>> Error while invoking RpcHandler#receive() for one-way message.
>> 00:30:04,679 WARN  - org.apache.spark.SparkException: Could not find
>> CoarseGrainedScheduler.
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Dis
>> patcher.postMessage(Dispatcher.scala:154)
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Dis
>> patcher.postOneWayMessage(Dispatcher.scala:134)
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Net
>> tyRpcHandler.receive(NettyRpcEnv.scala:570)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportRequestHandler.processOneWayMessage(TransportRequ
>> estHandler.java:180)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportRequestHandler.handle(TransportRequestHandler.java:109)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportChannelHandler.channelRead0(TransportChannelHandler.java:119)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51)
>> 00:30:04,679 WARN  -     at io.netty.channel.SimpleChannel
>> InboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.handler.timeout.IdleS
>> tateHandler.channelRead(IdleStateHandler.java:266)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.handler.codec.Message
>> ToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.util.
>> TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.channel.DefaultChanne
>> lPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.DefaultChanne
>> lPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.AbstractN
>> ioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKey(NioEventLoop.java:652)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKeysOptimized(NioEventLoop.java:575)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKeys(NioEventLoop.java:489)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.run(NioEventLoop.java:451)
>> 00:30:04,679 WARN  -     at io.netty.util.concurrent.Singl
>> eThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140)
>> 00:30:04,679 WARN  -     at io.netty.util.concurrent.Defau
>> ltThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFa
>> ctory.java:144)
>> 00:30:04,679 WARN  -     at java.lang.Thread.run(Thread.java:748)
>> 00:30:04,679 WARN  - 17/07/22 00:30:04 ERROR TransportRequestHandler:
>> Error while invoking RpcHandler#receive() for one-way message.
>> 00:30:04,679 WARN  - org.apache.spark.SparkException: Could not find
>> CoarseGrainedScheduler.
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Dis
>> patcher.postMessage(Dispatcher.scala:154)
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Dis
>> patcher.postOneWayMessage(Dispatcher.scala:134)
>> 00:30:04,679 WARN  -     at org.apache.spark.rpc.netty.Net
>> tyRpcHandler.receive(NettyRpcEnv.scala:570)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportRequestHandler.processOneWayMessage(TransportRequ
>> estHandler.java:180)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportRequestHandler.handle(TransportRequestHandler.java:109)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportChannelHandler.channelRead0(TransportChannelHandler.java:119)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.serve
>> r.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51)
>> 00:30:04,679 WARN  -     at io.netty.channel.SimpleChannel
>> InboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.handler.timeout.IdleS
>> tateHandler.channelRead(IdleStateHandler.java:266)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.handler.codec.Message
>> ToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at org.apache.spark.network.util.
>> TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
>> 00:30:04,679 WARN  -     at io.netty.channel.DefaultChanne
>> lPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:367)
>> 00:30:04,679 WARN  -     at io.netty.channel.AbstractChann
>> elHandlerContext.invokeChannelRead(AbstractChannelHandlerCon
>> text.java:353)
>> 00:30:04,679 WARN  -     at io.netty.channel.DefaultChanne
>> lPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.AbstractN
>> ioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKey(NioEventLoop.java:652)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKeysOptimized(NioEventLoop.java:575)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.processSelectedKeys(NioEventLoop.java:489)
>> 00:30:04,679 WARN  -     at io.netty.channel.nio.NioEventL
>> oop.run(NioEventLoop.java:451)
>> 00:30:04,679 WARN  -     at io.netty.util.concurrent.Singl
>> eThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140)
>> 00:30:04,679 WARN  -     at io.netty.util.concurrent.Defau
>> ltThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFa
>> ctory.java:144)
>> 00:30:04,679 WARN  -     at java.lang.Thread.run(Thread.java:748)
>> 00:30:11,318 WARN  - I0722 00:30:11.318724 2921 sched.cpp:2021] Asked to
>> stop the driver
>> 00:30:11,318 WARN  - I0722 00:30:11.318838 2988 sched.cpp:1203] Stopping
>> framework 40aeb8e5-e82a-4df9-b034-8815a7a7564b-2543
>>
>
>

Reply via email to