Did you encounter a similar error on a smaller dataset? Which release of Spark are you using?
Is it possible you have an incompatible snappy version somewhere in your classpath?
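If you want a quick way to confirm what is actually being loaded, something along these lines from the PySpark shell should show which snappy-java the driver JVM picked up and from where (a rough sketch, assuming the usual sc SparkContext; since the failure happens on the executors, the same check would have to run inside a task to see what they load):

    # Rough sketch: ask the driver JVM which snappy-java class it loaded and from where.
    snappy_cls = sc._jvm.java.lang.Class.forName("org.xerial.snappy.Snappy")
    print(snappy_cls.getProtectionDomain().getCodeSource().getLocation().toString())
    # snappy-java also reports the version of the native library it bundles.
    print(sc._jvm.org.xerial.snappy.Snappy.getNativeLibraryVersion())

As another isolation test, you could rerun the join with spark.io.compression.codec set to lz4 (or lzf); if the shuffle then completes, that points at snappy rather than at the join itself.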
Thanks

On Fri, Apr 8, 2016 at 12:36 PM, entee <ntilm...@gmail.com> wrote:

> I'm trying to do a relatively large join (0.5TB shuffle read/write) and just
> calling a count (or show) on the dataframe fails to complete, getting to the
> last task before failing:
>
> Py4JJavaError: An error occurred while calling o133.count.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 5
> in stage 11.0 failed 4 times, most recent failure: Lost task 5.3 in stage
> 11.0 (TID 7836, xxxx.com): java.io.IOException: failed to uncompress the
> chunk: PARSING_ERROR(2)
>
> (full stack trace below)
>
> I'm not sure why this would happen, any ideas about whether our
> configuration is off or how to fix this?
>
> Nicolas
>
>
> Py4JJavaError: An error occurred while calling o133.count.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 5
> in stage 11.0 failed 4 times, most recent failure: Lost task 5.3 in stage
> 11.0 (TID 7836, xxxx.com): java.io.IOException: failed to uncompress the
> chunk: PARSING_ERROR(2)
>   at org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:361)
>   at org.xerial.snappy.SnappyInputStream.rawRead(SnappyInputStream.java:158)
>   at org.xerial.snappy.SnappyInputStream.read(SnappyInputStream.java:142)
>   at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
>   at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
>   at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
>   at java.io.DataInputStream.read(DataInputStream.java:149)
>   at org.spark-project.guava.io.ByteStreams.read(ByteStreams.java:899)
>   at org.spark-project.guava.io.ByteStreams.readFully(ByteStreams.java:733)
>   at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$3$$anon$1.next(UnsafeRowSerializer.scala:119)
>   at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$3$$anon$1.next(UnsafeRowSerializer.scala:102)
>   at scala.collection.Iterator$$anon$12.next(Iterator.scala:397)
>   at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
>   at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:30)
>   at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
>   at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
>   at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:168)
>   at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:90)
>   at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:64)
>   at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
>   at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:88)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
>   at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
>   at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:396)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
>   at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:163)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>   at org.apache.spark.scheduler.Task.run(Task.scala:89)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>   at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>   at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
>   at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
>   at scala.Option.foreach(Option.scala:257)
>   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
>   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
>   at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
>   at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
>   at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:166)
>   at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
>   at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
>   at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
>   at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
>   at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1498)
>   at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1505)
>   at org.apache.spark.sql.DataFrame$$anonfun$count$1.apply(DataFrame.scala:1515)
>   at org.apache.spark.sql.DataFrame$$anonfun$count$1.apply(DataFrame.scala:1514)
>   at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
>   at org.apache.spark.sql.DataFrame.count(DataFrame.scala:1514)
>   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:497)
>   at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
>   at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
>   at py4j.Gateway.invoke(Gateway.java:259)
>   at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
>   at py4j.commands.CallCommand.execute(CallCommand.java:79)
>   at py4j.GatewayConnection.run(GatewayConnection.java:209)
>   at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.IOException: failed to uncompress the chunk: PARSING_ERROR(2)
>   at org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:361)
>   at org.xerial.snappy.SnappyInputStream.rawRead(SnappyInputStream.java:158)
>   at org.xerial.snappy.SnappyInputStream.read(SnappyInputStream.java:142)
>   at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
>   at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
>   at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
>   at java.io.DataInputStream.read(DataInputStream.java:149)
>   at org.spark-project.guava.io.ByteStreams.read(ByteStreams.java:899)
>   at org.spark-project.guava.io.ByteStreams.readFully(ByteStreams.java:733)
>   at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$3$$anon$1.next(UnsafeRowSerializer.scala:119)
>   at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$3$$anon$1.next(UnsafeRowSerializer.scala:102)
>   at scala.collection.Iterator$$anon$12.next(Iterator.scala:397)
>   at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
>   at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:30)
>   at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
>   at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
>   at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:168)
>   at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:90)
>   at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:64)
>   at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
>   at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:88)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
>   at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
>   at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
>   at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:396)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
>   at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:163)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>   at org.apache.spark.scheduler.Task.run(Task.scala:89)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>   ... 1 more
>
>
> --
> View this message in context:
> http://apache-spark-user-list.1001560.n3.nabble.com/DataFrame-job-fails-on-parsing-error-help-tp26724.html
> Sent from the Apache Spark User List mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
> For additional commands, e-mail: user-h...@spark.apache.org