[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178427033 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala --- @@ -39,11 +40,36 @@ private[sql] object CreateJacksonParser extends Serializable { jsonFactory.createParser(new InputStreamReader(bain, "UTF-8")) } - def text(jsonFactory: JsonFactory, record: Text): JsonParser = { -jsonFactory.createParser(record.getBytes, 0, record.getLength) + def text(jsonFactory: JsonFactory, record: Text, charset: Option[String] = None): JsonParser = { +charset match { --- End diff -- `:%s/charset/encoding` .. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178427011 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala --- @@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } - def testLineSeparator(lineSep: String): Unit = { -test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") { - // Read - val data = -s""" - | {"f": - |"a", "f0": 1}$lineSep{"f": - | - |"c", "f0": 2}$lineSep{"f": "d", "f0": 3} -""".stripMargin - val dataWithTrailingLineSep = s"$data$lineSep" - - Seq(data, dataWithTrailingLineSep).foreach { lines => -withTempPath { path => - Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8)) - val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath) - val expectedSchema = -StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil) - checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF()) - assert(df.schema === expectedSchema) + def testFile(fileName: String): String = { + Thread.currentThread().getContextClassLoader.getResource(fileName).toString + } + + test("SPARK-23723: json in UTF-16 with BOM") { +val fileName = "json-tests/utf16WithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + // This option will be replaced by .option("lineSep", "x00 0a") + // as soon as lineSep allows to specify sequence of bytes in hexadecimal format. + .option("mode", "DROPMALFORMED") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq( + Row("Chris", "Baird"), Row("Doug", "Rood") +)) + } + + test("SPARK-23723: multi-line json in UTF-32BE with BOM") { +val fileName = "json-tests/utf32BEWithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("encoding" -> "UTF-16LE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Unsupported charset name") { +val invalidCharset = "UTF-128" +val exception = intercept[java.io.UnsupportedEncodingException] { + spark.read +.options(Map("charset" -> invalidCharset, "lineSep" -> "\n")) +.json(testFile("json-tests/utf16LE.json")) +.count() +} + +assert(exception.getMessage.contains(invalidCharset)) + } + + test("SPARK-23723: checking that the charset option is case agnostic") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("charset" -> "uTf-16lE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + + test("SPARK-23723: specified charset is not matched to actual charset") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val exception = intercept[SparkException] { + spark.read.schema(schema) +.option("mode", "FAILFAST") +.option("multiline", "true") +.options(Map("charset" -> "UTF-16BE")) +.json(testFile(fileName)) +.count() +} +val errMsg = exception.getMessage + +assert(errMsg.contains("Malformed records are detected in record parsing")) + } + + def checkCharset( +expectedCharset: String, +pathToJsonFiles: String, +expectedContent: String + ): Unit = { +val jsonFiles = new File(pathToJsonFiles) + .listFiles() + .filter(_.isFile) + .filter(_.getName.endsWith("json")) +val jsonContent = jsonFiles.map { file => +
[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178426994 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala --- @@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } - def testLineSeparator(lineSep: String): Unit = { -test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") { - // Read - val data = -s""" - | {"f": - |"a", "f0": 1}$lineSep{"f": - | - |"c", "f0": 2}$lineSep{"f": "d", "f0": 3} -""".stripMargin - val dataWithTrailingLineSep = s"$data$lineSep" - - Seq(data, dataWithTrailingLineSep).foreach { lines => -withTempPath { path => - Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8)) - val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath) - val expectedSchema = -StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil) - checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF()) - assert(df.schema === expectedSchema) + def testFile(fileName: String): String = { + Thread.currentThread().getContextClassLoader.getResource(fileName).toString + } + + test("SPARK-23723: json in UTF-16 with BOM") { +val fileName = "json-tests/utf16WithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + // This option will be replaced by .option("lineSep", "x00 0a") + // as soon as lineSep allows to specify sequence of bytes in hexadecimal format. + .option("mode", "DROPMALFORMED") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq( + Row("Chris", "Baird"), Row("Doug", "Rood") +)) + } + + test("SPARK-23723: multi-line json in UTF-32BE with BOM") { +val fileName = "json-tests/utf32BEWithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("encoding" -> "UTF-16LE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Unsupported charset name") { +val invalidCharset = "UTF-128" +val exception = intercept[java.io.UnsupportedEncodingException] { + spark.read +.options(Map("charset" -> invalidCharset, "lineSep" -> "\n")) +.json(testFile("json-tests/utf16LE.json")) +.count() +} + +assert(exception.getMessage.contains(invalidCharset)) + } + + test("SPARK-23723: checking that the charset option is case agnostic") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("charset" -> "uTf-16lE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + + test("SPARK-23723: specified charset is not matched to actual charset") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val exception = intercept[SparkException] { + spark.read.schema(schema) +.option("mode", "FAILFAST") +.option("multiline", "true") +.options(Map("charset" -> "UTF-16BE")) --- End diff -- If that's difficult (or weird or hacky), let's just make whitelist and document them explicitly. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178426903 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala --- @@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } - def testLineSeparator(lineSep: String): Unit = { --- End diff -- You don't have to completely remove this out .. just take out the ones not working. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178426879 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala --- @@ -86,14 +86,30 @@ private[sql] class JSONOptions( val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) - val lineSeparator: Option[String] = parameters.get("lineSep").map { sep => -require(sep.nonEmpty, "'lineSep' cannot be an empty string.") -sep + /** + * A sequence of bytes between two consecutive json records. + */ + val lineSeparator: Option[String] = parameters.get("lineSep") + + /** + * Standard charset name. For example UTF-8, UTF-16 and UTF-32. + * If charset is not specified (None), it will be detected automatically. + */ + val charset: Option[String] = parameters.get("charset") --- End diff -- I thought we talked `encoding` would have higher priority like CSV. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20937#discussion_r178426866 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala --- @@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } - def testLineSeparator(lineSep: String): Unit = { -test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") { - // Read - val data = -s""" - | {"f": - |"a", "f0": 1}$lineSep{"f": - | - |"c", "f0": 2}$lineSep{"f": "d", "f0": 3} -""".stripMargin - val dataWithTrailingLineSep = s"$data$lineSep" - - Seq(data, dataWithTrailingLineSep).foreach { lines => -withTempPath { path => - Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8)) - val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath) - val expectedSchema = -StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil) - checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF()) - assert(df.schema === expectedSchema) + def testFile(fileName: String): String = { + Thread.currentThread().getContextClassLoader.getResource(fileName).toString + } + + test("SPARK-23723: json in UTF-16 with BOM") { +val fileName = "json-tests/utf16WithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + // This option will be replaced by .option("lineSep", "x00 0a") + // as soon as lineSep allows to specify sequence of bytes in hexadecimal format. + .option("mode", "DROPMALFORMED") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq( + Row("Chris", "Baird"), Row("Doug", "Rood") +)) + } + + test("SPARK-23723: multi-line json in UTF-32BE with BOM") { +val fileName = "json-tests/utf32BEWithBOM.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("encoding" -> "UTF-16LE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + test("SPARK-23723: Unsupported charset name") { +val invalidCharset = "UTF-128" +val exception = intercept[java.io.UnsupportedEncodingException] { + spark.read +.options(Map("charset" -> invalidCharset, "lineSep" -> "\n")) +.json(testFile("json-tests/utf16LE.json")) +.count() +} + +assert(exception.getMessage.contains(invalidCharset)) + } + + test("SPARK-23723: checking that the charset option is case agnostic") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val jsonDF = spark.read.schema(schema) + .option("multiline", "true") + .options(Map("charset" -> "uTf-16lE")) + .json(testFile(fileName)) + +checkAnswer(jsonDF, Seq(Row("Chris", "Baird"))) + } + + + test("SPARK-23723: specified charset is not matched to actual charset") { +val fileName = "json-tests/utf16LE.json" +val schema = new StructType().add("firstName", StringType).add("lastName", StringType) +val exception = intercept[SparkException] { + spark.read.schema(schema) +.option("mode", "FAILFAST") +.option("multiline", "true") +.options(Map("charset" -> "UTF-16BE")) --- End diff -- Can we have a test case with `UTF-16`? I assume the delimiter itself has the BOM and won't work correctly? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/20849 Let's make the point clear. There are two things, _1. one for line-by-line parsing_ and _2. JSON parsing via Jackson_. The test you pointed out looks still a bit weird because Jackson is going to detect the encoding for each line not the whole file. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/20849 From a quick look and wild guess, `UTF-16` case would be alone problematic because we are going to make the delimiter with a BOM bit `0xFF 0xFE 0x0D 0x00 0x0A 0x00`. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/20849 @MaxGekk, So to make it clear, it parses line by line correctly regardless of BOM if we set `lineSep` + `encoding` fine but it fails to parse each line as JSON via Jackson since we explicitly set `UTF-16LE` or `UTF-16` for JSON parsing? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18853 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18853 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/testing-k8s-prb-make-spark-distribution/1882/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/18853 **[Test build #88780 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88780/testReport)** for PR 18853 at commit [`81067b9`](https://github.com/apache/spark/commit/81067b984b95e04429db1ad19e6161c31a8228c0). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/20935#discussion_r178425464 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala --- @@ -18,18 +18,35 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.RowOrdering +import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types._ class ColumnStatsSuite extends SparkFunSuite { - testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0)) - testColumnStats(classOf[ByteColumnStats], BYTE, Array(Byte.MaxValue, Byte.MinValue, 0)) - testColumnStats(classOf[ShortColumnStats], SHORT, Array(Short.MaxValue, Short.MinValue, 0)) - testColumnStats(classOf[IntColumnStats], INT, Array(Int.MaxValue, Int.MinValue, 0)) - testColumnStats(classOf[LongColumnStats], LONG, Array(Long.MaxValue, Long.MinValue, 0)) - testColumnStats(classOf[FloatColumnStats], FLOAT, Array(Float.MaxValue, Float.MinValue, 0)) - testColumnStats(classOf[DoubleColumnStats], DOUBLE, Array(Double.MaxValue, Double.MinValue, 0)) - testColumnStats(classOf[StringColumnStats], STRING, Array(null, null, 0)) - testDecimalColumnStats(Array(null, null, 0)) + testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0, 0, 0)) --- End diff -- Those changes to `testColumnStats` seems unnecessary? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/20935#discussion_r178425406 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala --- @@ -323,18 +324,31 @@ private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) ext } private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats { + protected var upper: Any = null + protected var lower: Any = null + val columnType = ColumnType(dataType) + val ordering = dataType match { +case x if RowOrdering.isOrderable(dataType) && x != NullType => + Option(TypeUtils.getInterpretedOrdering(x)) +case _ => None + } override def gatherStats(row: InternalRow, ordinal: Int): Unit = { if (!row.isNullAt(ordinal)) { val size = columnType.actualSize(row, ordinal) sizeInBytes += size count += 1 + ordering.foreach { order => +val value = row.get(ordinal, dataType) +if (upper == null || order.gt(value, upper)) upper = value +if (lower == null || order.lt(value, lower)) lower = value --- End diff -- For unsafe row and array, doesn't we need to copy the value? In the added test this can't be tested because the random rows are all individual instances, however, it can be the same instance of unsafe row or array during query evaluation. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/20935#discussion_r178425525 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala --- @@ -54,12 +54,22 @@ object ColumnarTestUtils { case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => -new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10 +val schema = StructType(Array(StructField("test", StringType))) +val converter = UnsafeProjection.create(schema) + converter(InternalRow(Array(UTF8String.fromString(Random.nextString(10))): _*)) case ARRAY(_) => -new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) +UnsafeArrayData.fromPrimitiveArray(Array(Random.nextInt(), Random.nextInt())) case MAP(_) => -ArrayBasedMapData( - Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32) +val unsafeConverter = + UnsafeProjection.create(Array[DataType](MapType(IntegerType, StringType))) +val row = new GenericInternalRow(1) +def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { + row.update(0, map) + val unsafeRow = unsafeConverter.apply(row) + unsafeRow.getMap(0).copy +} +toUnsafeMap(ArrayBasedMapData( + Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)) --- End diff -- Seems above changes to data generation are unnecessary too? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user wangyum commented on the issue: https://github.com/apache/spark/pull/18853 retest this please --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20954 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88777/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20954 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18853 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18853 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88778/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/18853 **[Test build #88778 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88778/testReport)** for PR 18853 at commit [`81067b9`](https://github.com/apache/spark/commit/81067b984b95e04429db1ad19e6161c31a8228c0). * This patch **fails due to an unknown error code, -9**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20954 **[Test build #88777 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88777/testReport)** for PR 20954 at commit [`e9a661f`](https://github.com/apache/spark/commit/e9a661f8270a1d1366951038907b12abaa08aa2f). * This patch **fails due to an unknown error code, -9**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20940: [SPARK-23429][CORE] Add executor memory metrics to heart...
Github user felixcheung commented on the issue: https://github.com/apache/spark/pull/20940 @edwinalu please take a look https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88725/testReport --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20907: [WIP][SPARK-11237][ML] Add pmml export for k-mean...
Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/20907#discussion_r178425007 --- Diff: mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala --- @@ -185,6 +187,47 @@ class KMeansModel private[ml] ( } } +/** Helper class for storing model data */ +private case class ClusterData(clusterIdx: Int, clusterCenter: Vector) + + +/** A writer for KMeans that handles the "internal" (or default) format */ +private class InternalKMeansModelWriter extends MLWriterFormat with MLFormatRegister { + + override def format(): String = "internal" + override def stageName(): String = "org.apache.spark.ml.clustering.KMeansModel" + + override def write(path: String, sparkSession: SparkSession, +optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = { +val instance = stage.asInstanceOf[KMeansModel] +val sc = sparkSession.sparkContext +// Save metadata and Params +DefaultParamsWriter.saveMetadata(instance, path, sc) +// Save model data: cluster centers +val data: Array[ClusterData] = instance.clusterCenters.zipWithIndex.map { + case (center, idx) => +ClusterData(idx, center) --- End diff -- ð --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20949 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20949 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88779/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20949 **[Test build #88779 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88779/testReport)** for PR 20949 at commit [`b9a7bf0`](https://github.com/apache/spark/commit/b9a7bf03b312da151e1d7e37338092bbf5bcb38a). * This patch **fails Scala style tests**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20949 **[Test build #88779 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88779/testReport)** for PR 20949 at commit [`b9a7bf0`](https://github.com/apache/spark/commit/b9a7bf03b312da151e1d7e37338092bbf5bcb38a). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20949: [SPARK-19018][SQL] Add support for custom encodin...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20949#discussion_r178424628 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala --- @@ -513,6 +513,43 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } + test("Save csv with custom charset") { +Seq("iso-8859-1", "utf-8", "windows-1250").foreach { encoding => + withTempDir { dir => +val csvDir = new File(dir, "csv").getCanonicalPath +// scalastyle:off +val originalDF = Seq("µà áâä ÃÃÃ").toDF("_c0") +// scalastyle:on +originalDF.write + .option("header", "false") + .option("encoding", encoding) + .csv(csvDir) + +val df = spark + .read + .option("header", "false") + .option("encoding", encoding) --- End diff -- I think our CSV read encoding option is incomplete for now .. there are many discussions about this now. I am going to fix the read path soon. Let me revisit this after fixing it. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/20949 ok to test --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #20948: corrected filename for spark config
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/20948 I think https://github.com/apache/spark/pull/20928 includes this. Let that PR fix it and close this one. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19560 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19560 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88775/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19560 **[Test build #88775 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88775/testReport)** for PR 19560 at commit [`78b34bd`](https://github.com/apache/spark/commit/78b34bd7b79550b23730e1c9cdf06620e52b66f2). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org