I'm getting a DFS ClosedChannelException every now and then when I run
checkpointing. I checkpoint every 15 minutes or so, and the failure usually
shows up after the job has been running for 1-2 hours. Has anyone seen this
before?
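For context, here is a minimal sketch of the kind of periodic checkpointing
I mean. The checkpoint directory, the driver loop, and the transformations
are illustrative placeholders, not the actual job:

// Minimal sketch of periodic RDD checkpointing; paths and loop are placeholders.
import org.apache.spark.{SparkConf, SparkContext}

object PeriodicCheckpoint {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("periodic-checkpoint"))
    sc.setCheckpointDir("hdfs:///tmp/checkpoints") // hypothetical HDFS path

    var rdd = sc.parallelize(1 to 1000000)
    val intervalMs = 15 * 60 * 1000L // checkpoint roughly every 15 minutes
    var last = System.currentTimeMillis()

    while (moreWorkToDo()) {  // stand-in for the job's real driver loop
      rdd = rdd.map(_ + 1)    // stand-in for the real transformations
      if (System.currentTimeMillis() - last >= intervalMs) {
        rdd.checkpoint()      // mark the RDD for checkpointing...
        rdd.count()           // ...and force the write to HDFS now
        last = System.currentTimeMillis()
      }
    }
    sc.stop()
  }

  def moreWorkToDo(): Boolean = true // placeholder termination condition
}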

Job aborted due to stage failure: Task 6 in stage 70.0 failed 4 times,
most recent failure: Lost task 6.3 in stage 70.0 (TID 1264,
alpinenode7.alpinenow.local):
java.nio.channels.ClosedChannelException:
        org.apache.hadoop.hdfs.DFSOutputStream.checkClosed(DFSOutputStream.java:1526)
        org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:98)
        org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:58)
        java.io.DataOutputStream.write(DataOutputStream.java:107)
        java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
        java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
        java.io.ObjectOutputStream.writeNonProxyDesc(ObjectOutputStream.java:1285)
        java.io.ObjectOutputStream.writeClassDesc(ObjectOutputStream.java:1230)
        java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1426)
        java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
        java.io.ObjectOutputStream.writeFatalException(ObjectOutputStream.java:1576)
        java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:350)
        org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:42)
        org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:110)
        org.apache.spark.rdd.CheckpointRDD$.writeToFile(CheckpointRDD.scala:114)
        org.apache.spark.rdd.RDDCheckpointData$$anonfun$doCheckpoint$1.apply(RDDCheckpointData.scala:95)
        org.apache.spark.rdd.RDDCheckpointData$$anonfun$doCheckpoint$1.apply(RDDCheckpointData.scala:95)
        org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
        org.apache.spark.scheduler.Task.run(Task.scala:54)
        org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
        java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        java.lang.Thread.run(Thread.java:744)