[ https://issues.apache.org/jira/browse/SPARK-42288?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17683245#comment-17683245 ]
Apache Spark commented on SPARK-42288:
--------------------------------------

User 'Yikf' has created a pull request for this issue:
https://github.com/apache/spark/pull/39858

> Expose file path if reading failed
> ----------------------------------
>
>                 Key: SPARK-42288
>                 URL: https://issues.apache.org/jira/browse/SPARK-42288
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.4.0
>            Reporter: Yi kaifei
>            Priority: Minor
>
> A `MalformedInputException` may be thrown when decompression fails while reading a file. In this case the error message does not contain the file name; if the file name were included, it would be much easier to locate the problem.
> {code:java}
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 41 in stage 15641.0 failed 10 times, most recent failure: Lost task 41.9 in stage 15641.0 (TID 6287211) (hostname executor 58): io.airlift.compress.MalformedInputException: Malformed input: offset=65075
>     at io.airlift.compress.snappy.SnappyRawDecompressor.uncompressAll(SnappyRawDecompressor.java:108)
>     at io.airlift.compress.snappy.SnappyRawDecompressor.decompress(SnappyRawDecompressor.java:53)
>     at io.airlift.compress.snappy.SnappyDecompressor.decompress(SnappyDecompressor.java:45)
>     at org.apache.orc.impl.AircompressorCodec.decompress(AircompressorCodec.java:94)
>     at org.apache.orc.impl.SnappyCodec.decompress(SnappyCodec.java:45)
>     at org.apache.orc.impl.InStream$CompressedStream.readHeader(InStream.java:495)
>     at org.apache.orc.impl.InStream$CompressedStream.ensureUncompressed(InStream.java:522)
>     at org.apache.orc.impl.InStream$CompressedStream.read(InStream.java:509)
>     at org.apache.orc.impl.SerializationUtils.readRemainingLongs(SerializationUtils.java:1102)
>     at org.apache.orc.impl.SerializationUtils.unrolledUnPackBytes(SerializationUtils.java:1094)
>     at org.apache.orc.impl.SerializationUtils.unrolledUnPack32(SerializationUtils.java:1059)
>     at org.apache.orc.impl.SerializationUtils.readInts(SerializationUtils.java:925)
>     at org.apache.orc.impl.RunLengthIntegerReaderV2.readDirectValues(RunLengthIntegerReaderV2.java:268)
>     at org.apache.orc.impl.RunLengthIntegerReaderV2.readValues(RunLengthIntegerReaderV2.java:69)
>     at org.apache.orc.impl.RunLengthIntegerReaderV2.next(RunLengthIntegerReaderV2.java:323)
>     at org.apache.orc.impl.RunLengthIntegerReaderV2.nextVector(RunLengthIntegerReaderV2.java:373)
>     at org.apache.orc.impl.TreeReaderFactory$LongTreeReader.nextVector(TreeReaderFactory.java:641)
>     at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:2047)
>     at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1219)
>     at org.apache.spark.sql.execution.datasources.orc.OrcColumnarBatchReader.nextBatch(OrcColumnarBatchReader.java:197)
>     at org.apache.spark.sql.execution.datasources.orc.OrcColumnarBatchReader.nextKeyValue(OrcColumnarBatchReader.java:99)
>     at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
>     at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
>     at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:522)
>     at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.columnartorow_nextBatch_0$(Unknown Source)
>     at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.agg_doAggregateWithKeys_0$(Unknown Source)
>     at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.processNext(Unknown Source)
>     at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>     at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
>     at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
>     at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:179)
>     at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
>     at org.apache.spark.scheduler.Task.run(Task.scala:131)
>     at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:510)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:513)
>     at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
>     at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
>     at java.base/java.lang.Thread.run(Thread.java:829)
> {code}
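As a rough sketch of the requested improvement (an illustration only, not the change in the linked pull request): a per-file wrapper around the record iterator can rethrow any read failure with the file path attached, so a task failure like the one above names the bad file. The names FilePathErrorContext, withFilePathContext, filePath and rows below are placeholders, not existing Spark APIs.

{code:scala}
object FilePathErrorContext {
  // Wrap a per-file record iterator so that any exception raised while
  // reading is rethrown with the path of the file being read attached.
  def withFilePathContext[T](filePath: String, rows: Iterator[T]): Iterator[T] =
    new Iterator[T] {
      private def orRethrow[A](body: => A): A =
        try body
        catch {
          case e: Exception =>
            // Chain the original exception so the MalformedInputException
            // and its stack trace are preserved, while the message now
            // points at the file that failed to decompress.
            throw new RuntimeException(s"Error while reading file: $filePath", e)
        }

      override def hasNext: Boolean = orRethrow(rows.hasNext)
      override def next(): T = orRethrow(rows.next())
    }
}
{code}

With something along these lines in place, the failure above would surface as "Error while reading file: <path>" wrapping the original MalformedInputException, making it straightforward to locate and inspect the corrupt file.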