Repository: spark Updated Branches: refs/heads/master 6d7ebf2f9 -> b96f61b6b
[SPARK-22475][SQL] show histogram in DESC COLUMN command ## What changes were proposed in this pull request? Added the histogram representation to the output of the `DESCRIBE EXTENDED table_name column_name` command. ## How was this patch tested? Modified the SQL unit tests (UT) and checked the output. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Marco Gaido <mga...@hortonworks.com> Closes #19774 from mgaido91/SPARK-22475. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b96f61b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b96f61b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b96f61b6 Branch: refs/heads/master Commit: b96f61b6b262836e6be3f7657a3fe136d58b4dfe Parents: 6d7ebf2 Author: Marco Gaido <mga...@hortonworks.com> Authored: Tue Nov 21 20:55:24 2017 +0100 Committer: Wenchen Fan <wenc...@databricks.com> Committed: Tue Nov 21 20:55:24 2017 +0100 ---------------------------------------------------------------------- .../spark/sql/execution/command/tables.scala | 17 +++++ .../sql-tests/inputs/describe-table-column.sql | 10 +++ .../results/describe-table-column.sql.out | 74 +++++++++++++++++--- 3 files changed, 93 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 95f16b0..c9f6e57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.catalog._ import 
org.apache.spark.sql.catalyst.catalog.CatalogTableType._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.Histogram import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils} import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -689,9 +690,25 @@ case class DescribeColumnCommand( buffer += Row("distinct_count", cs.map(_.distinctCount.toString).getOrElse("NULL")) buffer += Row("avg_col_len", cs.map(_.avgLen.toString).getOrElse("NULL")) buffer += Row("max_col_len", cs.map(_.maxLen.toString).getOrElse("NULL")) + val histDesc = for { + c <- cs + hist <- c.histogram + } yield histogramDescription(hist) + buffer ++= histDesc.getOrElse(Seq(Row("histogram", "NULL"))) } buffer } + + private def histogramDescription(histogram: Histogram): Seq[Row] = { + val header = Row("histogram", + s"height: ${histogram.height}, num_of_bins: ${histogram.bins.length}") + val bins = histogram.bins.zipWithIndex.map { + case (bin, index) => + Row(s"bin_$index", + s"lower_bound: ${bin.lo}, upper_bound: ${bin.hi}, distinct_count: ${bin.ndv}") + } + header +: bins + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql index a6ddcd9..2d180d1 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql @@ -34,6 +34,16 @@ DESC FORMATTED desc_complex_col_table col; -- Describe a nested column DESC FORMATTED desc_complex_col_table col.x; +-- 
Test output for histogram statistics +SET spark.sql.statistics.histogram.enabled=true; +SET spark.sql.statistics.histogram.numBins=2; + +INSERT INTO desc_col_table values 1, 2, 3, 4; + +ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key; + +DESC EXTENDED desc_col_table key; + DROP VIEW desc_col_temp_view; DROP TABLE desc_col_table; http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out index 30d0a2d..6ef8af6 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 18 +-- Number of queries: 23 -- !query 0 @@ -34,6 +34,7 @@ num_nulls NULL distinct_count NULL avg_col_len NULL max_col_len NULL +histogram NULL -- !query 3 @@ -50,6 +51,7 @@ num_nulls NULL distinct_count NULL avg_col_len NULL max_col_len NULL +histogram NULL -- !query 4 @@ -66,6 +68,7 @@ num_nulls NULL distinct_count NULL avg_col_len NULL max_col_len NULL +histogram NULL -- !query 5 @@ -117,6 +120,7 @@ num_nulls 0 distinct_count 0 avg_col_len 4 max_col_len 4 +histogram NULL -- !query 10 @@ -133,6 +137,7 @@ num_nulls 0 distinct_count 0 avg_col_len 4 max_col_len 4 +histogram NULL -- !query 11 @@ -157,6 +162,7 @@ num_nulls NULL distinct_count NULL avg_col_len NULL max_col_len NULL +histogram NULL -- !query 13 @@ -173,6 +179,7 @@ num_nulls NULL distinct_count NULL avg_col_len NULL max_col_len NULL +histogram NULL -- !query 14 @@ -185,24 +192,75 @@ DESC TABLE COLUMN command does not support nested data types: col.x; -- !query 15 -DROP VIEW desc_col_temp_view +SET 
spark.sql.statistics.histogram.enabled=true -- !query 15 schema -struct<> +struct<key:string,value:string> -- !query 15 output - +spark.sql.statistics.histogram.enabled true -- !query 16 -DROP TABLE desc_col_table +SET spark.sql.statistics.histogram.numBins=2 -- !query 16 schema -struct<> +struct<key:string,value:string> -- !query 16 output - +spark.sql.statistics.histogram.numBins 2 -- !query 17 -DROP TABLE desc_complex_col_table +INSERT INTO desc_col_table values 1, 2, 3, 4 -- !query 17 schema struct<> -- !query 17 output + + +-- !query 18 +ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key +-- !query 18 schema +struct<> +-- !query 18 output + + + +-- !query 19 +DESC EXTENDED desc_col_table key +-- !query 19 schema +struct<info_name:string,info_value:string> +-- !query 19 output +col_name key +data_type int +comment column_comment +min 1 +max 4 +num_nulls 0 +distinct_count 4 +avg_col_len 4 +max_col_len 4 +histogram height: 2.0, num_of_bins: 2 +bin_0 lower_bound: 1.0, upper_bound: 2.0, distinct_count: 2 +bin_1 lower_bound: 2.0, upper_bound: 4.0, distinct_count: 2 + + +-- !query 20 +DROP VIEW desc_col_temp_view +-- !query 20 schema +struct<> +-- !query 20 output + + + +-- !query 21 +DROP TABLE desc_col_table +-- !query 21 schema +struct<> +-- !query 21 output + + + +-- !query 22 +DROP TABLE desc_complex_col_table +-- !query 22 schema +struct<> +-- !query 22 output + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org