Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22611#discussion_r222169616
  
    --- Diff: 
external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala ---
    @@ -342,6 +342,53 @@ class AvroSuite extends QueryTest with 
SharedSQLContext with SQLTestUtils {
         }
       }
     
    +  private def createDummyCorruptFile(dir: File): Unit = {
    +    FileUtils.forceMkdir(dir)
    +    val corruptFile = new File(dir, "corrupt.avro")
    +    val writer = new BufferedWriter(new FileWriter(corruptFile))
    +    writer.write("corrupt")
    +    writer.close()
    +  }
    +
    +  test("Ignore corrupt Avro file if flag IGNORE_CORRUPT_FILES enabled") {
    +    withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") {
    +      withTempPath { dir =>
    +        createDummyCorruptFile(dir)
    +        val message = intercept[FileNotFoundException] {
    +          spark.read.format("avro").load(dir.getAbsolutePath).schema
    +        }.getMessage
    +        assert(message.contains("No Avro files found."))
    +
    +        val srcFile = new File("src/test/resources/episodes.avro")
    +        val destFile = new File(dir, "episodes.avro")
    +        FileUtils.copyFile(srcFile, destFile)
    +
    +        val df = spark.read.format("avro").load(srcFile.getAbsolutePath)
    +        val schema = df.schema
    +        val result = df.collect()
    +        // Schema inference picks random readable sample file.
    +        // Here we use a loop to eliminate randomness.
    +        (1 to 5).foreach { _ =>
    +          
assert(spark.read.format("avro").load(dir.getAbsolutePath).schema == schema)
    +          checkAnswer(spark.read.format("avro").load(dir.getAbsolutePath), 
result)
    +        }
    +      }
    +    }
    +  }
    +
    +  test("Throws IOException on reading corrupt Avro file if flag 
IGNORE_CORRUPT_FILES disabled") {
    +    withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") {
    +      withTempPath { dir =>
    +        createDummyCorruptFile(dir)
    +        val message = intercept[org.apache.spark.SparkException] {
    +          spark.read.format("avro").load(dir.getAbsolutePath).schema
    --- End diff --
    
    `.schema` probably wouldn't be needed.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to