Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20937#discussion_r178426994

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }

-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          | {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c", "f0": 2}$lineSep{"f": "d", "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows to specify sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
.option("mode", "FAILFAST") + .option("multiline", "true") + .options(Map("charset" -> "UTF-16BE")) --- End diff -- If that's difficult (or weird or hacky), let's just make whitelist and document them explicitly.