spark git commit: [SPARK-23094] Fix invalid character handling in JsonDataSource

gurwls223 Thu, 18 Jan 2018 14:36:19 -0800

Repository: spark
Updated Branches:
  refs/heads/master f568e9cf7 -> e01919e83



[SPARK-23094] Fix invalid character handling in JsonDataSource

## What changes were proposed in this pull request?

There were two related fixes regarding `from_json`, `get_json_object` and 
`json_tuple` ([Fix 
#1](https://github.com/apache/spark/commit/c8803c06854683c8761fdb3c0e4c55d5a9e22a95),
 [Fix 
#2](https://github.com/apache/spark/commit/86174ea89b39a300caaba6baffac70f3dc702788)),
 but they weren't comprehensive it seems. I wanted to extend those fixes to all 
the parsers, and add tests for each case.

## How was this patch tested?

Regression tests

Author: Burak Yavuz <brk...@gmail.com>

Closes #20302 from brkyvz/json-invfix.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e01919e8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e01919e8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e01919e8

Branch: refs/heads/master
Commit: e01919e834d301e13adc8919932796ebae900576
Parents: f568e9c
Author: Burak Yavuz <brk...@gmail.com>
Authored: Fri Jan 19 07:36:06 2018 +0900
Committer: hyukjinkwon <gurwls...@gmail.com>
Committed: Fri Jan 19 07:36:06 2018 +0900

----------------------------------------------------------------------
 .../sql/catalyst/json/CreateJacksonParser.scala |  5 +--
 .../sql/sources/JsonHadoopFsRelationSuite.scala | 34 ++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e01919e8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
index 025a388..b1672e7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala
@@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends 
Serializable {
   }
 
   def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
-    jsonFactory.createParser(record.getBytes, 0, record.getLength)
+    val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
+    jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
   }
 
   def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = 
{
-    jsonFactory.createParser(record)
+    jsonFactory.createParser(new InputStreamReader(record, "UTF-8"))
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/e01919e8/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
index 49be304..27f398e 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
@@ -28,6 +28,8 @@ import org.apache.spark.sql.types._
 class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
   override val dataSourceName: String = "json"
 
+  private val badJson = "\u0000\u0000\u0000A\u0001AAA"
+
   // JSON does not write data of NullType and does not play well with 
BinaryType.
   override protected def supportsDataType(dataType: DataType): Boolean = 
dataType match {
     case _: NullType => false
@@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends 
HadoopFsRelationTest {
       )
     }
   }
+
+  test("invalid json with leading nulls - from file (multiLine=true)") {
+    import testImplicits._
+    withTempDir { tempDir =>
+      val path = tempDir.getAbsolutePath
+      Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
+      val expected = s"""$badJson\n{"a":1}\n"""
+      val schema = new StructType().add("a", 
IntegerType).add("_corrupt_record", StringType)
+      val df =
+        spark.read.format(dataSourceName).option("multiLine", 
true).schema(schema).load(path)
+      checkAnswer(df, Row(null, expected))
+    }
+  }
+
+  test("invalid json with leading nulls - from file (multiLine=false)") {
+    import testImplicits._
+    withTempDir { tempDir =>
+      val path = tempDir.getAbsolutePath
+      Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
+      val schema = new StructType().add("a", 
IntegerType).add("_corrupt_record", StringType)
+      val df =
+        spark.read.format(dataSourceName).option("multiLine", 
false).schema(schema).load(path)
+      checkAnswer(df, Seq(Row(1, null), Row(null, badJson)))
+    }
+  }
+
+  test("invalid json with leading nulls - from dataset") {
+    import testImplicits._
+    checkAnswer(
+      spark.read.json(Seq(badJson).toDS()),
+      Row(badJson))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-23094] Fix invalid character handling in JsonDataSource

Reply via email to