Repository: spark Updated Branches: refs/heads/master f1fca81b1 -> dbf3e298a
[SPARK-18764][CORE] Add a warning log when skipping a corrupted file ## What changes were proposed in this pull request? It's better to log a warning when skipping a corrupted file. This is helpful when we want to let the job finish first, then find the corrupted files in the log and fix them. ## How was this patch tested? Jenkins Author: Shixiong Zhu <shixi...@databricks.com> Closes #16192 from zsxwing/SPARK-18764. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbf3e298 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbf3e298 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbf3e298 Branch: refs/heads/master Commit: dbf3e298a1a35c0243f087814ddf88034ff96d66 Parents: f1fca81 Author: Shixiong Zhu <shixi...@databricks.com> Authored: Wed Dec 7 10:30:05 2016 -0800 Committer: Shixiong Zhu <shixi...@databricks.com> Committed: Wed Dec 7 10:30:05 2016 -0800 ---------------------------------------------------------------------- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 4 +++- core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 6 +++++- .../apache/spark/sql/execution/datasources/FileScanRDD.scala | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index e3d81a6..6e87233 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -259,7 +259,9 @@ class HadoopRDD[K, V]( try { finished = !reader.next(key, value) } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if 
ignoreCorruptFiles => + logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) + finished = true } if (!finished) { inputMetrics.incRecordsRead(1) http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index e90e84c..e805192 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -189,7 +189,11 @@ class NewHadoopRDD[K, V]( try { finished = !reader.nextKeyValue } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if ignoreCorruptFiles => + logWarning( + s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", + e) + finished = true } if (finished) { // Close and release the reader here; close() will also be called when the task http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 306dc65..6d8cd81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -139,6 +139,7 @@ class FileScanRDD( } } catch { case e: IOException => + logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e) finished = true null } --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org