Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/18865#discussion_r137930066 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala --- @@ -2034,4 +2034,34 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } } + + test("SPARK-21610: Corrupt records are not handled properly when creating a dataframe " + + "from a file") { + withTempPath { dir => + val path = dir.getCanonicalPath + val data = + """{"field": 1} + |{"field": 2} + |{"field": "3"}""".stripMargin + Seq(data).toDF().repartition(1).write.text(path) + val schema = new StructType().add("field", ByteType).add("_corrupt_record", StringType) + val expectedErrorMsg = "'_corrupt_record' cannot be selected alone" + var msg = intercept[AnalysisException] { + spark.read.schema(schema).json(path).select("_corrupt_record").collect() + }.getMessage + assert(msg.contains(expectedErrorMsg)) + // negative cases + msg = intercept[AnalysisException] { + spark.read.schema(schema).json(path).select("_corrupt_record").show() + }.getMessage + assert(msg.contains(expectedErrorMsg)) + intercept[catalyst.errors.TreeNodeException[_]] { + spark.read.schema(schema).json(path).filter($"_corrupt_record".isNotNull).count() + } + // workaround + val df = spark.read.schema(schema).json(path).cache() + assert(df.filter($"_corrupt_record".isNotNull).count() == 1) + assert(df.filter($"_corrupt_record".isNull).count() == 2) --- End diff -- Please also add another one ``` checkAnswer( spark.read.schema(schema).json(path).select("_corrupt_record"), Row(.... ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org