This is an automated email from the ASF dual-hosted git repository. sankarh pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 29753d7 HIVE-23887: Reset table level basic/column stats during import (Ashish Sharma, reviewed by Sankar Hariappan) 29753d7 is described below commit 29753d7027508ccbb2b7b06cbc567ceb6f289abd Author: Ashish Kumar Sharma <ashishkumarsharm...@gmail.com> AuthorDate: Fri Aug 21 22:22:57 2020 +0530 HIVE-23887: Reset table level basic/column stats during import (Ashish Sharma, reviewed by Sankar Hariappan) Signed-off-by: Sankar Hariappan <sank...@apache.org> Closes (#1370) --- .../org/apache/hadoop/hive/ql/metadata/Hive.java | 26 +++---- .../org/apache/hadoop/hive/ql/TestTxnExIm.java | 88 +++++++++++++++++++--- 2 files changed, 90 insertions(+), 24 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 054c55c..de8f044 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -2326,11 +2326,6 @@ public class Hive { + "partition that does not exist yet. Skipping generating INSERT event."); } - // column stats will be inaccurate - if (resetStatistics) { - StatsSetupConst.setBasicStatsState(newTPart.getParameters(), StatsSetupConst.FALSE); - } - // recreate the partition if it existed before if (isSkewedStoreAsSubdir) { org.apache.hadoop.hive.metastore.api.Partition newCreatedTpart = newTPart.getTPartition(); @@ -2342,9 +2337,15 @@ public class Hive { skewedInfo.setSkewedColValueLocationMaps(skewedColValueLocationMaps); newCreatedTpart.getSd().setSkewedInfo(skewedInfo); } - if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { + + // If there is no column stats gather stage present in the plan. So we don't know the accuracy of the stats or + // auto gather stats is turn off explicitly. We need to reset the stats in both cases. + if (resetStatistics || !this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { + LOG.debug( + "Clear partition column statistics by setting basic stats to false for " + newTPart.getCompleteName()); StatsSetupConst.setBasicStatsState(newTPart.getParameters(), StatsSetupConst.FALSE); } + if (oldPart == null) { newTPart.getTPartition().setParameters(new HashMap<String,String>()); if (this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { @@ -3067,15 +3068,12 @@ private void constructOneLBLocationMap(FileStatus fSta, } perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES); } - if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - LOG.debug("setting table statistics false for " + tbl.getDbName() + "." + tbl.getTableName()); - StatsSetupConst.setBasicStatsState(tbl.getParameters(), StatsSetupConst.FALSE); - } - //column stats will be inaccurate - if (resetStatistics) { - LOG.debug("Clearing table statistics for " + tbl.getDbName() + "." + tbl.getTableName()); - StatsSetupConst.clearColumnStatsState(tbl.getParameters()); + // If there is no column stats gather stage present in the plan. So we don't know the accuracy of the stats or + // auto gather stats is turn off explicitly. We need to reset the stats in both cases. + if (resetStatistics || !this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { + LOG.debug("Clear table column statistics and set basic statistics to false for " + tbl.getCompleteName()); + StatsSetupConst.setBasicStatsState(tbl.getParameters(), StatsSetupConst.FALSE); } try { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java index fe319b3..5801afa 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java @@ -389,21 +389,52 @@ target/tmp/org.apache.hadoop.hive.ql.TestTxnCommands-1521148657811/ @Test public void testImportPartitionedOrc() throws Exception { + // Clear and drop table T,Tstage runStatementOnDriver("drop table if exists T"); runStatementOnDriver("drop table if exists Tstage"); - runStatementOnDriver("create table T (a int, b int) partitioned by (p int) stored" + - " as orc tblproperties('transactional'='true')"); - //Tstage is the target table - runStatementOnDriver("create table Tstage (a int, b int) partitioned by (p int) stored" + - " as orc tblproperties('transactional'='true')"); - //this creates an ORC data file with correct schema under table root + + // Create source table - Tstage + runStatementOnDriver("create table Tstage (a int, b int) partitioned by (p int) stored" + + " as orc tblproperties('transactional'='true')"); + + // This creates an ORC data file with correct schema under table root runStatementOnDriver("insert into Tstage values(1,2,10),(3,4,11),(5,6,12)"); - final int[][] rows = {{3}}; - //now we have an archive with 3 partitions + final int[][] rows = { { 3 } }; + + // Check Partitions statistics + List<String> rsTstagePartitionsProperties = runStatementOnDriver("show partitions Tstage"); + for (String rsTstagePartition : rsTstagePartitionsProperties) { + List<String> rsPartitionProperties = + runStatementOnDriver("describe formatted Tstage partition(" + rsTstagePartition + ")"); + Assert.assertEquals("COLUMN_STATS_ACCURATE of partition " + rsTstagePartition + " of Tstage table", true, + rsPartitionProperties.contains("\tCOLUMN_STATS_ACCURATE\t{\\\"BASIC_STATS\\\":\\\"true\\\"}")); + Assert.assertEquals(" of partition " + rsTstagePartition + " of Tstage table", true, + rsPartitionProperties.contains("\tnumRows \t1 ")); + } + + // Now we have an archive Tstage with 3 partitions runStatementOnDriver("export table Tstage to '" + getWarehouseDir() + "/1'"); - //load T + // Load T runStatementOnDriver("import table T from '" + getWarehouseDir() + "/1'"); + + // Check basic stats in tblproperties of T + List<String> rsTProperties = runStatementOnDriver("show tblproperties T"); + Assert.assertEquals("COLUMN_STATS_ACCURATE of T table", false, + rsTProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}")); + Assert.assertEquals("numRows of T table", false, rsTProperties.contains("numRows\t3")); + + // Check Partitions statistics of T + List<String> rsTPartitionsProperties = runStatementOnDriver("show partitions T"); + for (String rsTPartition : rsTPartitionsProperties) { + List<String> rsPartitionProperties = runStatementOnDriver("describe formatted T partition(" + rsTPartition + ")"); + Assert.assertEquals("COLUMN_STATS_ACCURATE of partition " + rsTPartition + " of T table", false, + rsPartitionProperties.contains("\tCOLUMN_STATS_ACCURATE\t{\\\"BASIC_STATS\\\":\\\"true\\\"}")); + Assert.assertEquals(" of partition " + rsTPartition + " of T table", false, + rsPartitionProperties.contains("\tnumRows \t1 ")); + } + + // Verify the count(*) output List<String> rs = runStatementOnDriver("select count(*) from T"); Assert.assertEquals("Rowcount of imported table", TestTxnCommands2.stringifyValues(rows), rs); } @@ -566,4 +597,41 @@ target/tmp/org.apache.hadoop.hive.ql.TestTxnCommands-1521148657811/ TestTxnCommands2.stringifyValues(data), rs); } -} + + @Test + public void testImportOrc() throws Exception { + // Clear and Drop T and Tstage if exist + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("drop table if exists Tstage"); + + // Create source table - Tstage + runStatementOnDriver("create table Tstage (a int, b int) stored" + " as orc tblproperties('transactional'='true')"); + + // This creates an ORC data file with correct schema under table root + runStatementOnDriver("insert into Tstage values(1,2),(3,4),(5,6)"); + final int[][] rows = { { 3 } }; + + // Check Tstage statistics + List<String> rsTStageProperties = runStatementOnDriver("show tblproperties Tstage"); + Assert.assertEquals("COLUMN_STATS_ACCURATE of Tstage table", true, + rsTStageProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}")); + Assert.assertEquals("numRows of Tstage table", true, rsTStageProperties.contains("numRows\t3")); + Assert.assertEquals("numFiles of Tstage table", true, rsTStageProperties.contains("numFiles\t1")); + + // Now we have an archive Tstage table + runStatementOnDriver("export table Tstage to '" + getWarehouseDir() + "/1'"); + + // Load T + runStatementOnDriver("import table T from '" + getWarehouseDir() + "/1'"); + + // Check basic stats in tblproperties T + List<String> rsTProperties = runStatementOnDriver("show tblproperties T"); + Assert.assertEquals("COLUMN_STATS_ACCURATE of T table", false, + rsTProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}")); + Assert.assertEquals("numRows of T table", false, rsTProperties.contains("numRows\t3")); + + // Verify the count(*) output + List<String> rs = runStatementOnDriver("select count(*) from T"); + Assert.assertEquals("Rowcount of imported table", TestTxnCommands2.stringifyValues(rows), rs); + } +} \ No newline at end of file