Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20937#discussion_r184870196

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2167,4 +2171,241 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     val sampled = spark.read.option("samplingRatio", 1.0).json(ds)
     assert(sampled.count() == ds.count())
   }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "test-data/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .option("encoding", "UTF-16")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird"), Row("Doug", "Rood")))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "test-data/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported encoding name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[UnsupportedCharsetException] {
+      spark.read
+        .options(Map("encoding" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("test-data/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the encoding option is case agnostic") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+
+  test("SPARK-23723: specified encoding is not matched to actual encoding") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("encoding" -> "UTF-16BE"))
+        .json(testFile(fileName))
+        .count()
+    }
+    val errMsg = exception.getMessage
+
+    assert(errMsg.contains("Malformed records are detected in record parsing"))
+  }
+
+  def checkEncoding(expectedEncoding: String, pathToJsonFiles: String,
+      expectedContent: String): Unit = {
+    val jsonFiles = new File(pathToJsonFiles)
+      .listFiles()
+      .filter(_.isFile)
+      .filter(_.getName.endsWith("json"))
+    val actualContent = jsonFiles.map { file =>
+      new String(Files.readAllBytes(file.toPath), expectedEncoding)
+    }.mkString.trim
+
+    assert(actualContent == expectedContent)
+  }
+
+  test("SPARK-23723: save json in UTF-32BE") {
+    val encoding = "UTF-32BE"
+    withTempPath { path =>
+      val df = spark.createDataset(Seq(("Dog", 42)))
+      df.write
+        .options(Map("encoding" -> encoding, "lineSep" -> "\n"))
+        .json(path.getCanonicalPath)
+
+      checkEncoding(
+        expectedEncoding = encoding,
+        pathToJsonFiles = path.getCanonicalPath,
+        expectedContent = """{"_1":"Dog","_2":42}""")
+    }
+  }
+
+  test("SPARK-23723: save json in default encoding - UTF-8") {
+    withTempPath { path =>
+      val df = spark.createDataset(Seq(("Dog", 42)))
+      df.write.json(path.getCanonicalPath)
+
+      checkEncoding(
+        expectedEncoding = "UTF-8",
+        pathToJsonFiles = path.getCanonicalPath,
+        expectedContent = """{"_1":"Dog","_2":42}""")
+    }
+  }
+
+  test("SPARK-23723: wrong output encoding") {
+    val encoding = "UTF-128"
+    val exception = intercept[UnsupportedCharsetException] {
+      withTempPath { path =>
+        val df = spark.createDataset(Seq((0)))
+        df.write
+          .options(Map("encoding" -> encoding, "lineSep" -> "\n"))
+          .json(path.getCanonicalPath)
+      }
+    }
+
+    assert(exception.getMessage == encoding)
+  }
+
+  test("SPARK-23723: read back json in UTF-16LE") {
+    val options = Map("encoding" -> "UTF-16LE", "lineSep" -> "\n")
+    withTempPath { path =>
+      val ds = spark.createDataset(Seq(("a", 1), ("b", 2), ("c", 3))).repartition(2)
+      ds.write.options(options).json(path.getCanonicalPath)
+
+      val readBack = spark
+        .read
+        .options(options)
+        .json(path.getCanonicalPath)
+
+      checkAnswer(readBack.toDF(), ds.toDF())
+    }
+  }
+
+  def checkReadJson(lineSep: String, encoding: String, inferSchema: Boolean, id: Int): Unit = {
+    test(s"SPARK-23724: checks reading json in ${encoding} #${id}") {
+      val schema = new StructType().add("f1", StringType).add("f2", IntegerType)
+      withTempPath { path =>
+        val records = List(("a", 1), ("b", 2))
+        val data = records
+          .map(rec => s"""{"f1":"${rec._1}", "f2":${rec._2}}""".getBytes(encoding))
+          .reduce((a1, a2) => a1 ++ lineSep.getBytes(encoding) ++ a2)
+        val os = new FileOutputStream(path)
+        os.write(data)
+        os.close()
+        val reader = if (inferSchema) {
+          spark.read
+        } else {
+          spark.read.schema(schema)
+        }
+        val readBack = reader
+          .option("encoding", encoding)
+          .option("lineSep", lineSep)
+          .json(path.getCanonicalPath)
+        checkAnswer(readBack, records.map(rec => Row(rec._1, rec._2)))
+      }
+    }
+  }
+
+  // scalastyle:off nonascii
+  List(
+    (0, "|", "UTF-8", false),
+    (1, "^", "UTF-16BE", true),
+    (2, "::", "ISO-8859-1", true),
+    (3, "!!!@3", "UTF-32LE", false),
+    (4, 0x1E.toChar.toString, "UTF-8", true),
+    (5, "아", "UTF-32BE", false),
+    (6, "куку", "CP1251", true),
+    (7, "sep", "utf-8", false),
+    (8, "\r\n", "UTF-16LE", false),
+    (9, "\r\n", "utf-16be", true),
+    (10, "\u000d\u000a", "UTF-32BE", false),
+    (11, "\u000a\u000d", "UTF-8", true),
+    (12, "===", "US-ASCII", false),
+    (13, "$^+", "utf-32le", true)
+  ).foreach {
+    case (testNum, sep, encoding, inferSchema) => checkReadJson(sep, encoding, inferSchema, testNum)
--- End diff --

```
foreach { case (testNum, sep, encoding, inferSchema) =>
  ...
}
```

This is actually a style matter - see https://github.com/databricks/scala-style-guide#pattern-matching. Not a big deal.
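For reference, a minimal sketch of how the style guide's suggestion would read when applied to this list (trimmed to two tuples for brevity; `checkReadJson` is the helper defined in the diff above):

```
// Sketch only: opening the `case` clause on the same line as the brace
// makes the pattern-matching anonymous function read like an ordinary
// parameter list, per the style guide linked above.
List(
  (0, "|", "UTF-8", false),
  (1, "^", "UTF-16BE", true)
).foreach { case (testNum, sep, encoding, inferSchema) =>
  checkReadJson(sep, encoding, inferSchema, testNum)
}
```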