This can happen if the file size is 0: the SequenceFile reader runs out of
bytes where it still expects data, and readFully then throws exactly the
EOFException you see at the top of the executor trace.
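
If a zero-byte part file is the culprit, one defensive option is to drop the
empty files before handing the paths to Spark. Below is a minimal, untested
sketch against the Java API; it reuses the `sc` (JavaSparkContext) and
`inputFile` from your snippet, and everything else in it is illustrative:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;

// List the input directory (assumes inputFile is a directory, not a glob)
// and keep only files that actually contain bytes; a 0-byte part file
// cannot even hold the SequenceFile header.
FileSystem fs = FileSystem.get(sc.hadoopConfiguration());
List<String> nonEmpty = new ArrayList<>();
for (FileStatus status : fs.listStatus(new Path(inputFile))) {
    if (status.isFile() && status.getLen() > 0) {
        nonEmpty.add(status.getPath().toString());
    }
}
// sequenceFile goes through FileInputFormat, which accepts a
// comma-separated list of paths (assumes at least one file survived).
JavaPairRDD<Text, Text> pairs =
        sc.sequenceFile(String.join(",", nonEmpty), Text.class, Text.class);

You can check the hypothesis first with "hdfs dfs -ls -R <inputFile>" and
look for 0-byte part files. Note that a non-empty file truncated mid-write
would fail with the same EOFException from readFully, and skipping by size
alone will not catch that; such a file has to be located and regenerated.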
On Mon, Apr 29, 2019 at 2:28 PM Prateek Rajput
<prateek.raj...@flipkart.com.invalid> wrote:

> Hi guys,
> I am getting this strange error again and again while reading from a
> sequence file in Spark:
>
> User class threw exception: org.apache.spark.SparkException: Job aborted.
>   at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
>   at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
>   at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
>   at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
>   at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
>   at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1499)
>   at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
>   at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
>   at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1478)
>   at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
>   at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
>   at com.flipkart.prognos.spark.UniqueDroppedFSN.main(UniqueDroppedFSN.java:42)
>   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:498)
>   at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:678)
> Caused by: org.apache.spark.SparkException: Job aborted due to stage
> failure: Task 186 in stage 0.0 failed 4 times, most recent failure: Lost
> task 186.3 in stage 0.0 (TID 179, prod-fdphadoop-krios-dn-1039, executor
> 1): java.io.EOFException
>   at java.io.DataInputStream.readFully(DataInputStream.java:197)
>   at org.apache.hadoop.io.DataOutputBuffer$Buffer.write(DataOutputBuffer.java:70)
>   at org.apache.hadoop.io.DataOutputBuffer.write(DataOutputBuffer.java:120)
>   at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2436)
>   at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2568)
>   at org.apache.hadoop.mapred.SequenceFileRecordReader.next(SequenceFileRecordReader.java:82)
>   at org.apache.spark.rdd.HadoopRDD$$anon$1.getNext(HadoopRDD.scala:293)
>   at org.apache.spark.rdd.HadoopRDD$$anon$1.getNext(HadoopRDD.scala:224)
>   at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
>   at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
>   at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:191)
>   at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
>   at org.apache.spark.scheduler.Task.run(Task.scala:121)
>   at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
>   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>   at java.lang.Thread.run(Thread.java:745)
> Driver stacktrace:
>   at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
>   at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
>   at scala.Option.foreach(Option.scala:257)
>   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
>   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
>   at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
>   at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78)
>   ... 36 more
> Caused by: java.io.EOFException
>   at java.io.DataInputStream.readFully(DataInputStream.java:197)
>   at org.apache.hadoop.io.DataOutputBuffer$Buffer.write(DataOutputBuffer.java:70)
>   at org.apache.hadoop.io.DataOutputBuffer.write(DataOutputBuffer.java:120)
>   at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2436)
>   at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2568)
>   at org.apache.hadoop.mapred.SequenceFileRecordReader.next(SequenceFileRecordReader.java:82)
>   at org.apache.spark.rdd.HadoopRDD$$anon$1.getNext(HadoopRDD.scala:293)
>   at org.apache.spark.rdd.HadoopRDD$$anon$1.getNext(HadoopRDD.scala:224)
>   at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
>   at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
>   at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:191)
>   at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
>   at org.apache.spark.scheduler.Task.run(Task.scala:121)
>   at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
>   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>   at java.lang.Thread.run(Thread.java:745)
>
> My code block is:
>
> JavaRDD<Tuple2<String, String>> distinctFSN = sc.sequenceFile(inputFile, Text.class, Text.class)
>         .map(x -> new Tuple2<>(x._1.toString(), x._2.toString()))
>         .distinct();
>
> JavaRDD<String> orderEventsFSN = distinctFSN
>         .filter(x -> x._1().equals(ORDER_EVENTS))
>         .map(Tuple2::_2);
>
> JavaRDD<String> ravenEventsFSN = distinctFSN
>         .filter(x -> x._1().equals(RAVEN_EVENTS))
>         .map(Tuple2::_2);
>
> orderEventsFSN.coalesce(1).saveAsTextFile(outputDir + DIRECTORY_SEPARATOR + ORDER_EVENTS);
> ravenEventsFSN.coalesce(1).saveAsTextFile(outputDir + DIRECTORY_SEPARATOR + RAVEN_EVENTS);
>
> Please help with this issue, if someone has also faced it.
>
> SPARK-JAVA VERSION - 2.4.0
> JDK VERSION - 1.8
> SPARK ARTIFACTID - spark-core_2.11
>
> Regards,
> Prateek

--
Thanks
Deepak
www.bigdatabig.com
www.keosha.net