spark git commit: [SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows
Repository: spark Updated Branches: refs/heads/branch-2.0 29681cca1 -> 4939c2a12 [SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows ## What changes were proposed in this pull request? This PR addresses two related issues: 1. `Dataset.showString()` should show case classes/Java beans at all levels as rows, while master code only handles top level ones. 2. `Dataset.showString()` should show full contents produced by the underlying query plan. Dataset is only a view of the underlying query plan. Columns not referred by the encoder are still reachable using methods like `Dataset.col`. So it probably makes more sense to show full contents of the query plan. ## How was this patch tested? Two new test cases are added in `DatasetSuite` to check `.showString()` output. Author: Cheng Lian. Closes #13331 from liancheng/spark-15550-ds-show. (cherry picked from commit e7082caeb4a53c1ee172d136894eece1ac880f65) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4939c2a1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4939c2a1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4939c2a1 Branch: refs/heads/branch-2.0 Commit: 4939c2a12c854a87a020a7e759e4f87810f16710 Parents: 29681cc Author: Cheng Lian Authored: Thu May 26 16:23:48 2016 -0700 Committer: Cheng Lian Committed: Thu May 26 16:23:56 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 10 +-- .../org/apache/spark/sql/DatasetSuite.scala | 68 ++-- 2 files changed, 52 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4939c2a1/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 961ae32..85f0cf8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,19 +237,13 @@ class Dataset[T] private[sql]( */ private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = { val numRows = _numRows.max(0) -val takeResult = take(numRows + 1) +val takeResult = toDF().take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) // For array values, replace Seq and Array with square brackets // For cells that are beyond 20 characters, replace it with the first 17 and "..." -val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { - case r: Row => r - case tuple: Product => Row.fromTuple(tuple) - case definedByCtor: DefinedByConstructorParams => - Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor)) - case o => Row(o) -}.map { row => +val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row => row.toSeq.map { cell => val str = cell match { case null => "null" http://git-wip-us.apache.org/repos/asf/spark/blob/4939c2a1/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 05de79e..32320a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp} import scala.language.postfixOps -import org.scalatest.words.MatcherWords.be - import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder} +import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext @@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() checkDataset( ds.filter(_._1 == 
"b").select(expr("_1").as[String]), - ("b")) + "b") } test("foreach") { @@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext { assert(ds.toString == "[_1: int, _2: int]") } - test("showString: Kryo encoder") { -implicit val kryoEncoder = Encoders.kryo[KryoData] -val ds = Seq(KryoData(1), KryoData(2)).toDS() - -val expectedAnswer = """+---+ - || value| - |+---+ -
spark git commit: [SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows
Repository: spark Updated Branches: refs/heads/master fe6de16f7 -> e7082caeb [SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows ## What changes were proposed in this pull request? This PR addresses two related issues: 1. `Dataset.showString()` should show case classes/Java beans at all levels as rows, while master code only handles top level ones. 2. `Dataset.showString()` should show full contents produced by the underlying query plan. Dataset is only a view of the underlying query plan. Columns not referred by the encoder are still reachable using methods like `Dataset.col`. So it probably makes more sense to show full contents of the query plan. ## How was this patch tested? Two new test cases are added in `DatasetSuite` to check `.showString()` output. Author: Cheng Lian. Closes #13331 from liancheng/spark-15550-ds-show. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7082cae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7082cae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7082cae Branch: refs/heads/master Commit: e7082caeb4a53c1ee172d136894eece1ac880f65 Parents: fe6de16 Author: Cheng Lian Authored: Thu May 26 16:23:48 2016 -0700 Committer: Cheng Lian Committed: Thu May 26 16:23:48 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 10 +-- .../org/apache/spark/sql/DatasetSuite.scala | 68 ++-- 2 files changed, 52 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e7082cae/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 961ae32..85f0cf8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,19 +237,13 @@ class Dataset[T] private[sql]( */ private[sql] def 
showString(_numRows: Int, truncate: Boolean = true): String = { val numRows = _numRows.max(0) -val takeResult = take(numRows + 1) +val takeResult = toDF().take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) // For array values, replace Seq and Array with square brackets // For cells that are beyond 20 characters, replace it with the first 17 and "..." -val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { - case r: Row => r - case tuple: Product => Row.fromTuple(tuple) - case definedByCtor: DefinedByConstructorParams => - Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor)) - case o => Row(o) -}.map { row => +val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row => row.toSeq.map { cell => val str = cell match { case null => "null" http://git-wip-us.apache.org/repos/asf/spark/blob/e7082cae/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 05de79e..32320a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp} import scala.language.postfixOps -import org.scalatest.words.MatcherWords.be - import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder} +import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext @@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() checkDataset( ds.filter(_._1 == "b").select(expr("_1").as[String]), - ("b")) + "b") } test("foreach") { @@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with 
SharedSQLContext { assert(ds.toString == "[_1: int, _2: int]") } - test("showString: Kryo encoder") { -implicit val kryoEncoder = Encoders.kryo[KryoData] -val ds = Seq(KryoData(1), KryoData(2)).toDS() - -val expectedAnswer = """+---+ - || value| - |+---+ - ||KryoData(1)| - ||KryoData(2)| -