Repository: spark Updated Branches: refs/heads/master e1dd03e42 -> 1d5597b40
[SPARK-22626][SQL][FOLLOWUP] improve documentation and simplify test case ## What changes were proposed in this pull request? This PR improves documentation for not using zero `numRows` statistics and simplifies the test case. The reason why some Hive tables have zero `numRows` is that, in Hive, when stats gathering is disabled, `numRows` is always zero after an INSERT command: ``` hive> create table src (key int, value string) stored as orc; hive> desc formatted src; Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} numFiles 0 numRows 0 rawDataSize 0 totalSize 0 transient_lastDdlTime 1512399590 hive> set hive.stats.autogather=false; hive> insert into src select 1, 'a'; hive> desc formatted src; Table Parameters: numFiles 1 numRows 0 rawDataSize 0 totalSize 275 transient_lastDdlTime 1512399647 hive> insert into src select 1, 'b'; hive> desc formatted src; Table Parameters: numFiles 2 numRows 0 rawDataSize 0 totalSize 550 transient_lastDdlTime 1512399687 ``` ## How was this patch tested? Modified existing test. Author: Zhenhua Wang <wzh_...@163.com> Closes #19880 from wzhfy/doc_zero_rowCount. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d5597b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d5597b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d5597b4 Branch: refs/heads/master Commit: 1d5597b408485e41812f3645a670864ad88570a0 Parents: e1dd03e Author: Zhenhua Wang <wzh_...@163.com> Authored: Mon Dec 4 15:08:07 2017 -0800 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Mon Dec 4 15:08:07 2017 -0800 ---------------------------------------------------------------------- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 8 +++++--- .../org/apache/spark/sql/hive/StatisticsSuite.scala | 11 +++++------ 2 files changed, 10 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/1d5597b4/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 77e8360..08eb5c7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -426,9 +426,11 @@ private[hive] class HiveClientImpl( // TODO: stats should include all the other two fields (`numFiles` and `numPartitions`). // (see StatsSetupConst in Hive) val stats = - // When table is external, `totalSize` is always zero, which will influence join strategy - // so when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, - // return None. Later, we will use the other ways to estimate the statistics. + // When table is external, `totalSize` is always zero, which will influence join strategy. 
+ // So when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, + // return None. + // In Hive, when statistics gathering is disabled, `rawDataSize` and `numRows` is always + // zero after INSERT command. So they are used here only if they are larger than zero. if (totalSize.isDefined && totalSize.get > 0L) { Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0))) } else if (rawDataSize.isDefined && rawDataSize.get > 0) { http://git-wip-us.apache.org/repos/asf/spark/blob/1d5597b4/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index ee027e5..13f06a2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -1366,17 +1366,16 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql("CREATE TABLE maybe_big (c1 bigint)" + "TBLPROPERTIES ('numRows'='0', 'rawDataSize'='60000000000', 'totalSize'='8000000000000')") - val relation = spark.table("maybe_big").queryExecution.analyzed.children.head - .asInstanceOf[HiveTableRelation] + val catalogTable = getCatalogTable("maybe_big") - val properties = relation.tableMeta.ignoredProperties + val properties = catalogTable.ignoredProperties assert(properties("totalSize").toLong > 0) assert(properties("rawDataSize").toLong > 0) assert(properties("numRows").toLong == 0) - assert(relation.stats.sizeInBytes > 0) - // May be cause OOM if rowCount == 0 when enables CBO, see SPARK-22626 for details. 
- assert(relation.stats.rowCount.isEmpty) + val catalogStats = catalogTable.stats.get + assert(catalogStats.sizeInBytes > 0) + assert(catalogStats.rowCount.isEmpty) } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org