spark git commit: [SPARK-18764][CORE] Add a warning log when skipping a corrupted file
Repository: spark Updated Branches: refs/heads/branch-2.1 5dbcd4fcf -> acb6ac5da [SPARK-18764][CORE] Add a warning log when skipping a corrupted file ## What changes were proposed in this pull request? It's better to add a warning log when skipping a corrupted file. This is helpful when we want to finish the job first, then find the corrupted files in the log and fix them. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #16192 from zsxwing/SPARK-18764. (cherry picked from commit dbf3e298a1a35c0243f087814ddf88034ff96d66) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acb6ac5d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acb6ac5d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acb6ac5d Branch: refs/heads/branch-2.1 Commit: acb6ac5da7a5694cc3270772c6d68933b7d761dc Parents: 5dbcd4f Author: Shixiong Zhu Authored: Wed Dec 7 10:30:05 2016 -0800 Committer: Shixiong Zhu Committed: Wed Dec 7 10:30:15 2016 -0800 -- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 4 +++- core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 6 +++++- .../apache/spark/sql/execution/datasources/FileScanRDD.scala | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/acb6ac5d/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index ae4320d..3133a28 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -257,7 +257,9 @@ class HadoopRDD[K, V]( try { finished = !reader.next(key, value) } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if ignoreCorruptFiles => +logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) 
+finished = true } if (!finished) { inputMetrics.incRecordsRead(1) http://git-wip-us.apache.org/repos/asf/spark/blob/acb6ac5d/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index c783e13..c6ddb4b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -186,7 +186,11 @@ class NewHadoopRDD[K, V]( try { finished = !reader.nextKeyValue } catch { -case e: IOException if ignoreCorruptFiles => finished = true +case e: IOException if ignoreCorruptFiles => + logWarning( +s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", +e) + finished = true } if (finished) { // Close and release the reader here; close() will also be called when the task http://git-wip-us.apache.org/repos/asf/spark/blob/acb6ac5d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 8994457..237cdab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -138,6 +138,7 @@ class FileScanRDD( } } catch { case e: IOException => + logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e) finished = true null } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18764][CORE] Add a warning log when skipping a corrupted file
Repository: spark Updated Branches: refs/heads/master f1fca81b1 -> dbf3e298a [SPARK-18764][CORE] Add a warning log when skipping a corrupted file ## What changes were proposed in this pull request? It's better to add a warning log when skipping a corrupted file. This is helpful when we want to finish the job first, then find the corrupted files in the log and fix them. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #16192 from zsxwing/SPARK-18764. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbf3e298 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbf3e298 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbf3e298 Branch: refs/heads/master Commit: dbf3e298a1a35c0243f087814ddf88034ff96d66 Parents: f1fca81 Author: Shixiong Zhu Authored: Wed Dec 7 10:30:05 2016 -0800 Committer: Shixiong Zhu Committed: Wed Dec 7 10:30:05 2016 -0800 -- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 4 +++- core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 6 +++++- .../apache/spark/sql/execution/datasources/FileScanRDD.scala | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index e3d81a6..6e87233 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -259,7 +259,9 @@ class HadoopRDD[K, V]( try { finished = !reader.next(key, value) } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if ignoreCorruptFiles => +logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) +finished = true } if (!finished) { inputMetrics.incRecordsRead(1) 
http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index e90e84c..e805192 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -189,7 +189,11 @@ class NewHadoopRDD[K, V]( try { finished = !reader.nextKeyValue } catch { -case e: IOException if ignoreCorruptFiles => finished = true +case e: IOException if ignoreCorruptFiles => + logWarning( +s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", +e) + finished = true } if (finished) { // Close and release the reader here; close() will also be called when the task http://git-wip-us.apache.org/repos/asf/spark/blob/dbf3e298/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 306dc65..6d8cd81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -139,6 +139,7 @@ class FileScanRDD( } } catch { case e: IOException => + logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e) finished = true null } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org