This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 29753d7  HIVE-23887: Reset table level basic/column stats during import (Ashish Sharma, reviewed by Sankar Hariappan)
29753d7 is described below

commit 29753d7027508ccbb2b7b06cbc567ceb6f289abd
Author: Ashish Kumar Sharma <ashishkumarsharm...@gmail.com>
AuthorDate: Fri Aug 21 22:22:57 2020 +0530

    HIVE-23887: Reset table level basic/column stats during import (Ashish Sharma, reviewed by Sankar Hariappan)
    
    Signed-off-by: Sankar Hariappan <sank...@apache.org>
    Closes (#1370)
---
 .../org/apache/hadoop/hive/ql/metadata/Hive.java   | 26 +++----
 .../org/apache/hadoop/hive/ql/TestTxnExIm.java     | 88 +++++++++++++++++++---
 2 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 054c55c..de8f044 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -2326,11 +2326,6 @@ public class Hive {
                 + "partition that does not exist yet. Skipping generating 
INSERT event.");
       }
 
-      // column stats will be inaccurate
-      if (resetStatistics) {
-        StatsSetupConst.setBasicStatsState(newTPart.getParameters(), 
StatsSetupConst.FALSE);
-      }
-
       // recreate the partition if it existed before
       if (isSkewedStoreAsSubdir) {
         org.apache.hadoop.hive.metastore.api.Partition newCreatedTpart = 
newTPart.getTPartition();
@@ -2342,9 +2337,15 @@ public class Hive {
         skewedInfo.setSkewedColValueLocationMaps(skewedColValueLocationMaps);
         newCreatedTpart.getSd().setSkewedInfo(skewedInfo);
       }
-      if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+
+      // If there is no column stats gather stage present in the plan. So we don't know the accuracy of the stats or
+      // auto gather stats is turn off explicitly. We need to reset the stats in both cases.
+      if (resetStatistics || 
!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+        LOG.debug(
+            "Clear partition column statistics by setting basic stats to false 
for " + newTPart.getCompleteName());
         StatsSetupConst.setBasicStatsState(newTPart.getParameters(), 
StatsSetupConst.FALSE);
       }
+
       if (oldPart == null) {
         newTPart.getTPartition().setParameters(new HashMap<String,String>());
         if (this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
@@ -3067,15 +3068,12 @@ private void constructOneLBLocationMap(FileStatus fSta,
       }
       perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
     }
-    if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-      LOG.debug("setting table statistics false for " + tbl.getDbName() + "." 
+ tbl.getTableName());
-      StatsSetupConst.setBasicStatsState(tbl.getParameters(), 
StatsSetupConst.FALSE);
-    }
 
-    //column stats will be inaccurate
-    if (resetStatistics) {
-      LOG.debug("Clearing table statistics for " + tbl.getDbName() + "." + 
tbl.getTableName());
-      StatsSetupConst.clearColumnStatsState(tbl.getParameters());
+    // If there is no column stats gather stage present in the plan. So we don't know the accuracy of the stats or
+    // auto gather stats is turn off explicitly. We need to reset the stats in both cases.
+    if (resetStatistics || 
!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+      LOG.debug("Clear table column statistics and set basic statistics to 
false for " + tbl.getCompleteName());
+      StatsSetupConst.setBasicStatsState(tbl.getParameters(), 
StatsSetupConst.FALSE);
     }
 
     try {
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java
index fe319b3..5801afa 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnExIm.java
@@ -389,21 +389,52 @@ target/tmp/org.apache.hadoop.hive.ql.TestTxnCommands-1521148657811/
 
   @Test
   public void testImportPartitionedOrc() throws Exception {
+    // Clear and drop table T,Tstage
     runStatementOnDriver("drop table if exists T");
     runStatementOnDriver("drop table if exists Tstage");
-    runStatementOnDriver("create table T (a int, b int) partitioned by (p int) 
stored" +
-        " as orc tblproperties('transactional'='true')");
-    //Tstage is the target table
-    runStatementOnDriver("create table Tstage (a int, b int) partitioned by (p 
int) stored" +
-        " as orc tblproperties('transactional'='true')");
-    //this creates an ORC data file with correct schema under table root
+
+    // Create source table - Tstage
+    runStatementOnDriver("create table Tstage (a int, b int) partitioned by (p 
int) stored"
+        + " as orc tblproperties('transactional'='true')");
+
+    // This creates an ORC data file with correct schema under table root
     runStatementOnDriver("insert into Tstage 
values(1,2,10),(3,4,11),(5,6,12)");
-    final int[][] rows = {{3}};
-    //now we have an archive with 3 partitions
+    final int[][] rows = { { 3 } };
+
+    // Check Partitions statistics
+    List<String> rsTstagePartitionsProperties = runStatementOnDriver("show 
partitions Tstage");
+    for (String rsTstagePartition : rsTstagePartitionsProperties) {
+      List<String> rsPartitionProperties =
+          runStatementOnDriver("describe formatted Tstage partition(" + 
rsTstagePartition + ")");
+      Assert.assertEquals("COLUMN_STATS_ACCURATE of partition " + 
rsTstagePartition + " of Tstage table", true,
+          
rsPartitionProperties.contains("\tCOLUMN_STATS_ACCURATE\t{\\\"BASIC_STATS\\\":\\\"true\\\"}"));
+      Assert.assertEquals(" of partition " + rsTstagePartition + " of Tstage 
table", true,
+          rsPartitionProperties.contains("\tnumRows             \t1            
       "));
+    }
+
+    // Now we have an archive Tstage with 3 partitions
     runStatementOnDriver("export table Tstage to '" + getWarehouseDir() + 
"/1'");
 
-    //load T
+    // Load T
     runStatementOnDriver("import table T from '" + getWarehouseDir() + "/1'");
+
+    // Check basic stats in tblproperties of T
+    List<String> rsTProperties = runStatementOnDriver("show tblproperties T");
+    Assert.assertEquals("COLUMN_STATS_ACCURATE of T table", false,
+        
rsTProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}"));
+    Assert.assertEquals("numRows of T table", false, 
rsTProperties.contains("numRows\t3"));
+
+    // Check Partitions statistics of T
+    List<String> rsTPartitionsProperties = runStatementOnDriver("show 
partitions T");
+    for (String rsTPartition : rsTPartitionsProperties) {
+      List<String> rsPartitionProperties = runStatementOnDriver("describe 
formatted T partition(" + rsTPartition + ")");
+      Assert.assertEquals("COLUMN_STATS_ACCURATE of partition " + rsTPartition 
+ " of T table", false,
+          
rsPartitionProperties.contains("\tCOLUMN_STATS_ACCURATE\t{\\\"BASIC_STATS\\\":\\\"true\\\"}"));
+      Assert.assertEquals(" of partition " + rsTPartition + " of T table", 
false,
+          rsPartitionProperties.contains("\tnumRows             \t1            
       "));
+    }
+
+    // Verify the count(*) output
     List<String> rs = runStatementOnDriver("select count(*) from T");
     Assert.assertEquals("Rowcount of imported table", 
TestTxnCommands2.stringifyValues(rows), rs);
   }
@@ -566,4 +597,41 @@ target/tmp/org.apache.hadoop.hive.ql.TestTxnCommands-1521148657811/
         TestTxnCommands2.stringifyValues(data), rs);
 
   }
-}
+
+  @Test
+  public void testImportOrc() throws Exception {
+    // Clear and Drop T and Tstage if exist
+    runStatementOnDriver("drop table if exists T");
+    runStatementOnDriver("drop table if exists Tstage");
+
+    // Create source table - Tstage
+    runStatementOnDriver("create table Tstage (a int, b int) stored" + " as 
orc tblproperties('transactional'='true')");
+
+    // This creates an ORC data file with correct schema under table root
+    runStatementOnDriver("insert into Tstage values(1,2),(3,4),(5,6)");
+    final int[][] rows = { { 3 } };
+
+    // Check Tstage statistics
+    List<String> rsTStageProperties = runStatementOnDriver("show tblproperties 
Tstage");
+    Assert.assertEquals("COLUMN_STATS_ACCURATE of Tstage table", true,
+        
rsTStageProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}"));
+    Assert.assertEquals("numRows of Tstage table", true, 
rsTStageProperties.contains("numRows\t3"));
+    Assert.assertEquals("numFiles of Tstage table", true, 
rsTStageProperties.contains("numFiles\t1"));
+
+    // Now we have an archive Tstage table
+    runStatementOnDriver("export table Tstage to '" + getWarehouseDir() + 
"/1'");
+
+    // Load T
+    runStatementOnDriver("import table T from '" + getWarehouseDir() + "/1'");
+
+    // Check basic stats in tblproperties T
+    List<String> rsTProperties = runStatementOnDriver("show tblproperties T");
+    Assert.assertEquals("COLUMN_STATS_ACCURATE of T table", false,
+        
rsTProperties.contains("COLUMN_STATS_ACCURATE\t{\"BASIC_STATS\":\"true\"}"));
+    Assert.assertEquals("numRows of T table", false, 
rsTProperties.contains("numRows\t3"));
+
+    // Verify the count(*) output
+    List<String> rs = runStatementOnDriver("select count(*) from T");
+    Assert.assertEquals("Rowcount of imported table", 
TestTxnCommands2.stringifyValues(rows), rs);
+  }
+}
\ No newline at end of file

Reply via email to