[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178427033
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala ---
@@ -39,11 +40,36 @@ private[sql] object CreateJacksonParser extends Serializable {
     jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
   }
 
-  def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
-    jsonFactory.createParser(record.getBytes, 0, record.getLength)
+  def text(jsonFactory: JsonFactory, record: Text, charset: Option[String] = None): JsonParser = {
+    charset match {
--- End diff --

`:%s/charset/encoding` .. (i.e., rename `charset` to `encoding` throughout)
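
A minimal sketch of what the renamed signature could look like (hedged; the `encoding` name and the fallback branch follow the diff above, everything else is illustrative):

```scala
import java.io.{ByteArrayInputStream, InputStreamReader}

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.hadoop.io.Text

// Sketch only: `encoding` replaces `charset`; when it is absent we keep the
// old byte-based fast path and let Jackson detect the encoding itself.
def text(jsonFactory: JsonFactory, record: Text, encoding: Option[String] = None): JsonParser = {
  encoding match {
    case None =>
      jsonFactory.createParser(record.getBytes, 0, record.getLength)
    case Some(enc) =>
      // Decode explicitly through a Reader so Jackson sees characters.
      val in = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
      jsonFactory.createParser(new InputStreamReader(in, enc))
  }
}
```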


---




[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178427011
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
 
-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          |  {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c",  "f0": 2}$lineSep{"f": "d",  "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows to specify sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("charset" -> "UTF-16BE"))
+        .json(testFile(fileName))
+        .count()
+    }
+    val errMsg = exception.getMessage
+
+    assert(errMsg.contains("Malformed records are detected in record parsing"))
+  }
+
+  def checkCharset(
+    expectedCharset: String,
+    pathToJsonFiles: String,
+    expectedContent: String
+  ): Unit = {
+    val jsonFiles = new File(pathToJsonFiles)
+      .listFiles()
+      .filter(_.isFile)
+      .filter(_.getName.endsWith("json"))
+    val jsonContent = jsonFiles.map { file =>

[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178426994
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
 
-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          |  {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c",  "f0": 2}$lineSep{"f": "d",  "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows to specify sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("charset" -> "UTF-16BE"))
--- End diff --

If that's difficult (or weird or hacky), let's just make a whitelist and document the supported encodings explicitly.
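
A hedged sketch of the whitelist idea (the helper name and the exact set of encodings are hypothetical):

```scala
import java.util.Locale

// Hypothetical whitelist of encodings known to work for line-by-line parsing.
val supportedEncodings = Set("UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE")

def checkEncoding(enc: String): String = {
  val normalized = enc.toUpperCase(Locale.ROOT)
  require(supportedEncodings.contains(normalized),
    s"Encoding '$enc' is not supported. Supported encodings: ${supportedEncodings.mkString(", ")}")
  normalized
}
```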


---




[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178426903
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
 
-  def testLineSeparator(lineSep: String): Unit = {
--- End diff --

You don't have to remove this entirely .. just take out the ones that aren't working.


---




[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178426879
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala ---
@@ -86,14 +86,30 @@ private[sql] class JSONOptions(
 
   val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
 
-  val lineSeparator: Option[String] = parameters.get("lineSep").map { sep =>
-    require(sep.nonEmpty, "'lineSep' cannot be an empty string.")
-    sep
+  /**
+   * A sequence of bytes between two consecutive json records.
+   */
+  val lineSeparator: Option[String] = parameters.get("lineSep")
+
+  /**
+   * Standard charset name. For example UTF-8, UTF-16 and UTF-32.
+   * If charset is not specified (None), it will be detected automatically.
+   */
+  val charset: Option[String] = parameters.get("charset")
--- End diff --

I thought we agreed that `encoding` would have higher priority, as in CSV.
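
For illustration, the CSV-style precedence could be a one-liner in `JSONOptions` (a sketch, assuming the existing `parameters` map):

```scala
// `encoding` wins; `charset` is kept only as an alias for compatibility.
val encoding: Option[String] =
  parameters.get("encoding").orElse(parameters.get("charset"))
```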


---




[GitHub] spark pull request #20937: [SPARK-23723][SPARK-23724][SQL] Support custom en...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r178426866
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
 
-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          |  {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c",  "f0": 2}$lineSep{"f": "d",  "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows to specify sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("charset" -> "UTF-16BE"))
--- End diff --

Can we have a test case with `UTF-16`? I assume the delimiter itself would include the BOM and wouldn't work correctly?


---




[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on the issue:

https://github.com/apache/spark/pull/20849
  
Let's make the point clear. There are two separate things here: _1. line-by-line parsing_ and _2. JSON parsing via Jackson_.

The test you pointed out still looks a bit weird because Jackson is going to detect the encoding for each line, not for the whole file.
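
A small self-contained illustration of that point (a sketch; the record content is made up):

```scala
import com.fasterxml.jackson.core.JsonFactory

// Jackson bootstraps encoding detection per parser, i.e. per record,
// from the first bytes of the input, not once for the whole file.
val factory = new JsonFactory()
Seq("UTF-8", "UTF-16LE").foreach { enc =>
  val parser = factory.createParser("""{"f": "a"}""".getBytes(enc))
  assert(parser.nextToken() != null) // parses although no encoding was declared
  parser.close()
}
```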



---




[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on the issue:

https://github.com/apache/spark/pull/20849
  
From a quick look and a wild guess, the `UTF-16` case alone would be problematic because we would end up building the delimiter with a BOM: `0xFF 0xFE 0x0D 0x00 0x0A 0x00`.
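
A quick way to see this on the JVM (note that `StandardCharsets.UTF_16` happens to encode big-endian with a `0xFE 0xFF` BOM; the byte order quoted above assumes little-endian output):

```scala
import java.nio.charset.StandardCharsets

// Encoding the separator with the BOM-carrying "UTF-16" charset prepends
// a BOM, so every record boundary would contain one.
val sep = "\r\n".getBytes(StandardCharsets.UTF_16)
println(sep.map(b => f"0x$b%02X").mkString(" ")) // 0xFE 0xFF 0x00 0x0D 0x00 0x0A

// An endianness-specific charset produces no BOM:
val sepLE = "\r\n".getBytes(StandardCharsets.UTF_16LE)
println(sepLE.map(b => f"0x$b%02X").mkString(" ")) // 0x0D 0x00 0x0A 0x00
```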


---




[GitHub] spark issue #20849: [SPARK-23723] New charset option for json datasource

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on the issue:

https://github.com/apache/spark/pull/20849
  
@MaxGekk, so to make it clear: it parses line by line correctly regardless of the BOM if we set `lineSep` + `encoding`, but it fails to parse each line as JSON via Jackson because we explicitly set `UTF-16LE` or `UTF-16` for JSON parsing?


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/18853
  
Merged build finished. Test PASSed.


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/18853
  
Test PASSed.
Refer to this link for build results (access rights to CI server needed): 

https://amplab.cs.berkeley.edu/jenkins//job/testing-k8s-prb-make-spark-distribution/1882/
Test PASSed.


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/18853
  
**[Test build #88780 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88780/testReport)** for PR 18853 at commit [`81067b9`](https://github.com/apache/spark/commit/81067b984b95e04429db1ad19e6161c31a8228c0).


---




[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...

2018-03-31 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/20935#discussion_r178425464
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala ---
@@ -18,18 +18,35 @@
 package org.apache.spark.sql.execution.columnar
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.expressions.RowOrdering
+import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
 class ColumnStatsSuite extends SparkFunSuite {
-  testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0))
-  testColumnStats(classOf[ByteColumnStats], BYTE, Array(Byte.MaxValue, Byte.MinValue, 0))
-  testColumnStats(classOf[ShortColumnStats], SHORT, Array(Short.MaxValue, Short.MinValue, 0))
-  testColumnStats(classOf[IntColumnStats], INT, Array(Int.MaxValue, Int.MinValue, 0))
-  testColumnStats(classOf[LongColumnStats], LONG, Array(Long.MaxValue, Long.MinValue, 0))
-  testColumnStats(classOf[FloatColumnStats], FLOAT, Array(Float.MaxValue, Float.MinValue, 0))
-  testColumnStats(classOf[DoubleColumnStats], DOUBLE, Array(Double.MaxValue, Double.MinValue, 0))
-  testColumnStats(classOf[StringColumnStats], STRING, Array(null, null, 0))
-  testDecimalColumnStats(Array(null, null, 0))
+  testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0, 0, 0))
--- End diff --

Those changes to `testColumnStats` seem unnecessary?


---




[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...

2018-03-31 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/20935#discussion_r178425406
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
@@ -323,18 +324,31 @@ private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) ext
 }
 
 private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats {
+  protected var upper: Any = null
+  protected var lower: Any = null
+
   val columnType = ColumnType(dataType)
+  val ordering = dataType match {
+    case x if RowOrdering.isOrderable(dataType) && x != NullType =>
+      Option(TypeUtils.getInterpretedOrdering(x))
+    case _ => None
+  }
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
       val size = columnType.actualSize(row, ordinal)
       sizeInBytes += size
       count += 1
+      ordering.foreach { order =>
+        val value = row.get(ordinal, dataType)
+        if (upper == null || order.gt(value, upper)) upper = value
+        if (lower == null || order.lt(value, lower)) lower = value
--- End diff --

For unsafe row and array, don't we need to copy the value? The added test can't catch this because the random rows are all individual instances; however, during query evaluation it can be the same instance of an unsafe row or array.
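
A sketch of the defensive copy being suggested (hypothetical placement inside `gatherStats`; `row`, `ordinal`, `dataType`, `order`, `upper` and `lower` come from the diff above):

```scala
import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeMapData, UnsafeRow}

// Unsafe values can be backed by a buffer that is reused between records,
// so anything retained as a running min/max must be copied first.
val raw = row.get(ordinal, dataType)
val value = raw match {
  case r: UnsafeRow => r.copy()
  case a: UnsafeArrayData => a.copy()
  case m: UnsafeMapData => m.copy()
  case other => other
}
if (upper == null || order.gt(value, upper)) upper = value
if (lower == null || order.lt(value, lower)) lower = value
```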


---




[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...

2018-03-31 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/20935#discussion_r178425525
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala ---
@@ -54,12 +54,22 @@ object ColumnarTestUtils {
       case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
       case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
       case STRUCT(_) =>
-        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
+        val schema = StructType(Array(StructField("test", StringType)))
+        val converter = UnsafeProjection.create(schema)
+        converter(InternalRow(Array(UTF8String.fromString(Random.nextString(10))): _*))
       case ARRAY(_) =>
-        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
+        UnsafeArrayData.fromPrimitiveArray(Array(Random.nextInt(), Random.nextInt()))
       case MAP(_) =>
-        ArrayBasedMapData(
-          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
+        val unsafeConverter =
+          UnsafeProjection.create(Array[DataType](MapType(IntegerType, StringType)))
+        val row = new GenericInternalRow(1)
+        def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
+          row.update(0, map)
+          val unsafeRow = unsafeConverter.apply(row)
+          unsafeRow.getMap(0).copy
+        }
+        toUnsafeMap(ArrayBasedMapData(
+          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))))
--- End diff --

The above changes to data generation seem unnecessary too?


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread wangyum
Github user wangyum commented on the issue:

https://github.com/apache/spark/pull/18853
  
retest this please


---




[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/20954
  
Test FAILed.
Refer to this link for build results (access rights to CI server needed): 
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88777/
Test FAILed.


---




[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/20954
  
Merged build finished. Test FAILed.


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/18853
  
Merged build finished. Test FAILed.


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/18853
  
Test FAILed.
Refer to this link for build results (access rights to CI server needed): 
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88778/
Test FAILed.


---




[GitHub] spark issue #18853: [SPARK-21646][SQL] Add new type coercion to compatible w...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/18853
  
**[Test build #88778 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88778/testReport)** for PR 18853 at commit [`81067b9`](https://github.com/apache/spark/commit/81067b984b95e04429db1ad19e6161c31a8228c0).
 * This patch **fails due to an unknown error code, -9**.
 * This patch merges cleanly.
 * This patch adds no public classes.


---




[GitHub] spark issue #20954: [BACKPORT][SPARK-23040][CORE] Returns interruptible iter...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/20954
  
**[Test build #88777 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88777/testReport)** for PR 20954 at commit [`e9a661f`](https://github.com/apache/spark/commit/e9a661f8270a1d1366951038907b12abaa08aa2f).
 * This patch **fails due to an unknown error code, -9**.
 * This patch merges cleanly.
 * This patch adds no public classes.


---




[GitHub] spark issue #20940: [SPARK-23429][CORE] Add executor memory metrics to heart...

2018-03-31 Thread felixcheung
Github user felixcheung commented on the issue:

https://github.com/apache/spark/pull/20940
  
@edwinalu please take a look 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88725/testReport


---




[GitHub] spark pull request #20907: [WIP][SPARK-11237][ML] Add pmml export for k-mean...

2018-03-31 Thread felixcheung
Github user felixcheung commented on a diff in the pull request:

https://github.com/apache/spark/pull/20907#discussion_r178425007
  
--- Diff: mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala ---
@@ -185,6 +187,47 @@ class KMeansModel private[ml] (
   }
 }
 
+/** Helper class for storing model data */
+private case class ClusterData(clusterIdx: Int, clusterCenter: Vector)
+
+
+/** A writer for KMeans that handles the "internal" (or default) format */
+private class InternalKMeansModelWriter extends MLWriterFormat with MLFormatRegister {
+
+  override def format(): String = "internal"
+  override def stageName(): String = "org.apache.spark.ml.clustering.KMeansModel"
+
+  override def write(path: String, sparkSession: SparkSession,
+    optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = {
+    val instance = stage.asInstanceOf[KMeansModel]
+    val sc = sparkSession.sparkContext
+    // Save metadata and Params
+    DefaultParamsWriter.saveMetadata(instance, path, sc)
+    // Save model data: cluster centers
+    val data: Array[ClusterData] = instance.clusterCenters.zipWithIndex.map {
+      case (center, idx) =>
+        ClusterData(idx, center)
--- End diff --

👍 


---




[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/20949
  
Merged build finished. Test FAILed.


---




[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/20949
  
Test FAILed.
Refer to this link for build results (access rights to CI server needed): 
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88779/
Test FAILed.


---




[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/20949
  
**[Test build #88779 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88779/testReport)** for PR 20949 at commit [`b9a7bf0`](https://github.com/apache/spark/commit/b9a7bf03b312da151e1d7e37338092bbf5bcb38a).
 * This patch **fails Scala style tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.


---




[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/20949
  
**[Test build #88779 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88779/testReport)** for PR 20949 at commit [`b9a7bf0`](https://github.com/apache/spark/commit/b9a7bf03b312da151e1d7e37338092bbf5bcb38a).


---




[GitHub] spark pull request #20949: [SPARK-19018][SQL] Add support for custom encodin...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20949#discussion_r178424628
  
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala ---
@@ -513,6 +513,43 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     }
   }
 
+  test("Save csv with custom charset") {
+    Seq("iso-8859-1", "utf-8", "windows-1250").foreach { encoding =>
+      withTempDir { dir =>
+        val csvDir = new File(dir, "csv").getCanonicalPath
+        // scalastyle:off
+        val originalDF = Seq("µß áâä ÁÂÄ").toDF("_c0")
+        // scalastyle:on
+        originalDF.write
+          .option("header", "false")
+          .option("encoding", encoding)
+          .csv(csvDir)
+
+        val df = spark
+          .read
+          .option("header", "false")
+          .option("encoding", encoding)
--- End diff --

I think our CSV read encoding option is incomplete for now .. there are many ongoing discussions about it. I am going to fix the read path soon; let me revisit this after fixing it.


---




[GitHub] spark issue #20949: [SPARK-19018][SQL] Add support for custom encoding on cs...

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on the issue:

https://github.com/apache/spark/pull/20949
  
ok to test


---




[GitHub] spark issue #20948: corrected filename for spark config

2018-03-31 Thread HyukjinKwon
Github user HyukjinKwon commented on the issue:

https://github.com/apache/spark/pull/20948
  
I think https://github.com/apache/spark/pull/20928 includes this change. Let's let that PR fix it and close this one.


---




[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/19560
  
Merged build finished. Test PASSed.


---




[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...

2018-03-31 Thread AmplabJenkins
Github user AmplabJenkins commented on the issue:

https://github.com/apache/spark/pull/19560
  
Test PASSed.
Refer to this link for build results (access rights to CI server needed): 
https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/88775/
Test PASSed.


---




[GitHub] spark issue #19560: [SPARK-22334][SQL] Check table size from filesystem in c...

2018-03-31 Thread SparkQA
Github user SparkQA commented on the issue:

https://github.com/apache/spark/pull/19560
  
**[Test build #88775 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88775/testReport)** for PR 19560 at commit [`78b34bd`](https://github.com/apache/spark/commit/78b34bd7b79550b23730e1c9cdf06620e52b66f2).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds no public classes.


---



