This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 43dcb91 [SPARK-19678][FOLLOW-UP][SQL] Add behavior change test when table statistics are incorrect 43dcb91 is described below commit 43dcb91a4cb25aa7e1cc5967194f098029a0361e Author: Yuming Wang <yumw...@ebay.com> AuthorDate: Fri Mar 8 11:47:49 2019 +0800 [SPARK-19678][FOLLOW-UP][SQL] Add behavior change test when table statistics are incorrect ## What changes were proposed in this pull request? Since Spark 2.2.0 ([SPARK-19678](https://issues.apache.org/jira/browse/SPARK-19678)), the below SQL changed from `broadcast join` to `sort merge join`: ```sql -- small external table with incorrect statistics CREATE EXTERNAL TABLE t1(c1 int) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' LOCATION 'file:///tmp/t1' TBLPROPERTIES ( 'rawDataSize'='-1', 'numFiles'='0', 'totalSize'='0', 'COLUMN_STATS_ACCURATE'='false', 'numRows'='-1' ); -- big table CREATE TABLE t2 (c1 int) LOCATION 'file:///tmp/t2' TBLPROPERTIES ( 'rawDataSize'='23437737', 'numFiles'='12222', 'totalSize'='333442230', 'COLUMN_STATS_ACCURATE'='false', 'numRows'='443442223' ); explain SELECT t1.c1 FROM t1 INNER JOIN t2 ON t1.c1 = t2.c1; ``` This PR adds a test case for this behavior change. ## How was this patch tested? unit tests Closes #24003 from wangyum/SPARK-19678. 
Authored-by: Yuming Wang <yumw...@ebay.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/hive/StatisticsSuite.scala | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index db2024e..630f02c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -110,6 +110,41 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } + test("Hive serde table with incorrect statistics") { + withTempDir { tempDir => + withTable("t1") { + spark.range(5).write.mode(SaveMode.Overwrite).parquet(tempDir.getCanonicalPath) + val dataSize = tempDir.listFiles.filter(!_.getName.endsWith(".crc")).map(_.length).sum + spark.sql( + s""" + |CREATE EXTERNAL TABLE t1(id BIGINT) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |LOCATION '${tempDir.getCanonicalPath}' + |TBLPROPERTIES ( + |'rawDataSize'='-1', 'numFiles'='0', 'totalSize'='0', + |'COLUMN_STATS_ACCURATE'='false', 'numRows'='-1' + |)""".stripMargin) + + spark.sql("REFRESH TABLE t1") + // Before SPARK-19678, sizeInBytes should be equal to dataSize. + // After SPARK-19678, sizeInBytes should be equal to DEFAULT_SIZE_IN_BYTES. + val relation1 = spark.table("t1").queryExecution.analyzed.children.head + assert(relation1.stats.sizeInBytes === spark.sessionState.conf.defaultSizeInBytes) + + spark.sql("REFRESH TABLE t1") + // After SPARK-19678 and enable ENABLE_FALL_BACK_TO_HDFS_FOR_STATS, + // sizeInBytes should be equal to dataSize. 
+ withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") { + val relation2 = spark.table("t1").queryExecution.analyzed.children.head + assert(relation2.stats.sizeInBytes === dataSize) + } + } + } + } + test("analyze Hive serde tables") { def queryTotalSize(tableName: String): BigInt = spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org