This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new b431e11eb19 HIVE-28268: Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false (Butao Zhang, reviewed by Denys Kuzmenko)
b431e11eb19 is described below
commit b431e11eb19def7df978547bd161ba102d59083c
Author: Butao Zhang <[email protected]>
AuthorDate: Fri Oct 25 22:49:37 2024 +0800
HIVE-28268: Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false (Butao Zhang, reviewed by Denys Kuzmenko)
Closes #5215
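
A note on the arithmetic the patch relies on: Iceberg's SnapshotSummary carries
total-records, total-position-deletes and total-equality-deletes. A position
delete removes exactly one row, so it can be subtracted from the total; an
equality delete can match any number of rows, so the total is left unadjusted
(and canComputeQueryUsingStats() guards the cases where the count cannot be
trusted). A minimal standalone sketch of that derivation, assuming the standard
Iceberg summary keys; the class and method names here are illustrative, not
part of the patch:

    import java.util.Map;

    final class SnapshotRowCountSketch {
      // Hypothetical helper mirroring the row-count derivation in getBasicStatistics().
      static long rowCount(Map<String, String> summary) {
        long totalRecords = Long.parseLong(summary.getOrDefault("total-records", "0"));
        long eqDeletes = Long.parseLong(summary.getOrDefault("total-equality-deletes", "0"));
        long posDeletes = Long.parseLong(summary.getOrDefault("total-position-deletes", "0"));
        // Subtract position deletes only when no equality deletes exist; the
        // result may be negative in edge cases, so fall back to the raw total.
        long actualRecords = totalRecords - (eqDeletes > 0 ? 0 : posDeletes);
        return actualRecords > 0 ? actualRecords : totalRecords;
      }
    }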
---
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 60 ++++----
.../apache/iceberg/mr/hive/IcebergTableUtil.java | 35 +++++
.../src/test/queries/positive/iceberg_stats.q | 22 +++
.../src/test/results/positive/iceberg_stats.q.out | 159 +++++++++++++++++++++
.../results/positive/write_iceberg_branch.q.out | 50 +++----
.../apache/hadoop/hive/ql/metadata/HiveUtils.java | 7 +-
.../hadoop/hive/ql/optimizer/StatsOptimizer.java | 11 +-
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 9 +-
8 files changed, 287 insertions(+), 66 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index b70bd15179f..fc7a9df77d2 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -464,41 +464,44 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
   public Map<String, String> getBasicStatistics(Partish partish) {
     org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
     // For write queries where rows got modified, don't fetch from cache as values could have changed.
-    Table table = getTable(hmsTable);
     Map<String, String> stats = Maps.newHashMap();
-    if (getStatsSource().equals(HiveMetaHook.ICEBERG)) {
-      if (table.currentSnapshot() != null) {
-        Map<String, String> summary = table.currentSnapshot().summary();
-        if (summary != null) {
+    if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) {
+      return hmsTable.getParameters();
+    }
+    Table table = getTable(hmsTable);

-          if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
-            stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
-          }
+    Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table);
+    if (snapshot != null) {
+      Map<String, String> summary = snapshot.summary();
+      if (summary != null) {
+
+        if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
+          stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
+        }

-          if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) {
-            long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
-            if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
-                summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
+        if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) {
+          long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
+          if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
+              summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {

-              long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
-              long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));
+            long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
+            long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));

-              long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);
-              totalRecords = actualRecords > 0 ? actualRecords : totalRecords;
-              // actualRecords maybe -ve in edge cases
-            }
-            stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords));
+            long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);
+            totalRecords = actualRecords > 0 ? actualRecords : totalRecords;
+            // actualRecords maybe -ve in edge cases
           }
+          stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords));
+        }

-          if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) {
-            stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP));
-          }
+        if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) {
+          stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP));
+        }
       }
-      } else {
-        stats.put(StatsSetupConst.NUM_FILES, "0");
-        stats.put(StatsSetupConst.ROW_COUNT, "0");
-        stats.put(StatsSetupConst.TOTAL_SIZE, "0");
       }
+    } else {
+      stats.put(StatsSetupConst.NUM_FILES, "0");
+      stats.put(StatsSetupConst.ROW_COUNT, "0");
+      stats.put(StatsSetupConst.TOTAL_SIZE, "0");
     }
     return stats;
   }
@@ -613,8 +616,9 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
   public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
     if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) {
       Table table = getTable(hmsTable);
-      if (table.currentSnapshot() != null) {
-        Map<String, String> summary = table.currentSnapshot().summary();
+      Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table);
+      if (snapshot != null) {
+        Map<String, String> summary = snapshot.summary();
         if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
             summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
index 773ae553b2b..d6ddf637a90 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
@@ -20,6 +20,7 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.time.ZoneId;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -31,6 +32,9 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.TimestampTZ;
+import org.apache.hadoop.hive.common.type.TimestampTZUtil;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
@@ -42,6 +46,7 @@ import org.apache.hadoop.hive.ql.parse.AlterTableExecuteSpec;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TransformSpec;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionStateUtil;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
@@ -59,6 +64,7 @@ import org.apache.iceberg.PositionDeletesScanTask;
import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.Schema;
+import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
@@ -79,6 +85,7 @@ import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.PropertyUtil;
+import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.StructProjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -567,4 +574,32 @@ public class IcebergTableUtil {
       throw new SemanticException(String.format("Error while fetching the partitions due to: %s", e));
     }
   }
+
+  public static Snapshot getTableSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) {
+    String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef());
+    Snapshot snapshot;
+    if (refName != null) {
+      snapshot = table.snapshot(refName);
+    } else if (hmsTable.getAsOfTimestamp() != null) {
+      ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() :
+          SessionState.get().getConf().getLocalTimeZone();
+      TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone);
+      long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli());
+      snapshot = table.snapshot(snapshotId);
+    } else if (hmsTable.getAsOfVersion() != null) {
+      try {
+        snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion()));
+      } catch (NumberFormatException e) {
+        SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion());
+        if (ref == null) {
+          throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " +
+              hmsTable.getAsOfVersion());
+        }
+        snapshot = table.snapshot(ref.snapshotId());
+      }
+    } else {
+      snapshot = table.currentSnapshot();
+    }
+    return snapshot;
+  }
 }
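
The helper above resolves which snapshot the statistics should come from, in a
fixed precedence order: an explicit branch/tag reference on the table, then
time travel by timestamp (getAsOfTimestamp), then by version (getAsOfVersion,
which may be a snapshot ID or a reference name), falling back to the current
snapshot. A hedged usage sketch; hmsTable and icebergTable are assumed to be
in scope and are not part of the patch:

    // Illustrative call site for the new helper.
    Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, icebergTable);
    if (snapshot != null && snapshot.summary() != null) {
      String rows = snapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP);
    }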
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
index de88018f32e..6fc965e1745 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
@@ -28,5 +28,27 @@ select count(*) from ice01;
insert overwrite table ice01 select * from ice01;
explain select count(*) from ice01;
+-- false means that count(*) query won't use row count stored in HMS
+set iceberg.hive.keep.stats=false;
+
+create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2');
+
+insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1);
+-- Iceberg table can utilize fetch task to directly retrieve the row count from iceberg SnapshotSummary
+explain select count(*) from ice03;
+select count(*) from ice03;
+
+-- delete some values
+delete from ice03 where id in (2,4);
+
+explain select count(*) from ice03;
+select count(*) from ice03;
+
+-- iow
+insert overwrite table ice03 select * from ice03;
+explain select count(*) from ice03;
+
drop table ice01;
drop table ice02;
+drop table ice03;
diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
index 33c60b54608..4e5b7094501 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
@@ -192,6 +192,155 @@ STAGE PLANS:
Processor Tree:
ListSink
+PREHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice03
+POSTHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice03
+PREHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice03
+POSTHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5
+PREHOOK: query: delete from ice03 where id in (2,4)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: default@ice03
+POSTHOOK: query: delete from ice03 where id in (2,4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: ice03
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ minReductionHashAggr: 0.6666666
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: insert overwrite table ice03 select * from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: default@ice03
+POSTHOOK: query: insert overwrite table ice03 select * from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
PREHOOK: query: drop table ice01
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice01
@@ -212,3 +361,13 @@ POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice02
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice02
+PREHOOK: query: drop table ice03
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@ice03
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice03
+POSTHOOK: query: drop table ice03
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice03
diff --git a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
index bc662c426a4..cbf1e0562d1 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
@@ -234,17 +234,17 @@ STAGE PLANS:
alias: ice01
filterExpr: (a = 22) (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (a = 22) (type: boolean)
- Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), 22 (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -316,7 +316,7 @@ STAGE PLANS:
alias: ice01
filterExpr: (c = 66) (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (c = 66) (type: boolean)
Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
@@ -451,20 +451,20 @@ STAGE PLANS:
alias: ice01
filterExpr: a is not null (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: a is not null (type: boolean)
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), a (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col5 (type: int)
null sort order: z
sort order: +
Map-reduce partition columns: _col5 (type: int)
- Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col6 (type: string), _col7 (type: int)
Execution mode: vectorized
Reducer 2
@@ -476,11 +476,11 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col5 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
- Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: string), _col0 (type: int), _col5 (type: string), _col7 (type: string), _col2 (type: int), _col6 (type: bigint), _col4 (type: bigint), _col3 (type: int), _col10 (type: int), _col9 (type: string), _col8 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
- Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((_col10 = _col1) and (_col10 > 100)) (type: boolean)
Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
@@ -498,14 +498,14 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint), _col3 (type: string), _col10 (type: int), _col9 (type: string), _col8 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -513,14 +513,14 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col10 (type: int), 'Merged' (type: string), (_col8 + 10) (type: int)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -543,24 +543,24 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: (_col10 = _col1) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col2 (type: string), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: int)
outputColumnNames: _col2, _col5, _col6, _col7
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
keys: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint)
minReductionHashAggr: 0.4
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint)
null sort order: zzzz
sort order: ++++
Map-reduce partition columns: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint)
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col4 (type: bigint)
Reducer 3
Execution mode: vectorized
@@ -570,7 +570,7 @@ STAGE PLANS:
keys: KEY._col0 (type: int), KEY._col1 (type: bigint), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (_col4 > 1L) (type: boolean)
Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
@@ -795,14 +795,14 @@ STAGE PLANS:
TableScan
alias: ice01
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: a (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
index 5343a1bb3bb..e4076d11367 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
@@ -518,11 +518,8 @@ public final class HiveUtils {
   }

   public static String getTableSnapshotRef(String refName) {
-    Matcher ref = SNAPSHOT_REF.matcher(refName);
-    if (ref.matches()) {
-      return ref.group(1);
-    }
-    return null;
+    Matcher ref = SNAPSHOT_REF.matcher(String.valueOf(refName));
+    return ref.matches() ? ref.group(1) : null;
   }

   public static Boolean isTableTag(String refName) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 38d67660c63..1de37c42105 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.stats.Partish;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
@@ -942,10 +943,16 @@ public class StatsOptimizer extends Transform {
           rowCnt += partRowCnt;
         }
       } else { // unpartitioned table
-        if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) {
+        Map<String, String> basicStats = tbl.getParameters();
+        if (MetaStoreUtils.isNonNativeTable(tbl.getTTable())) {
+          if (!tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) {
+            return null;
+          }
+          basicStats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl));
+        } else if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) {
           return null;
         }
-        rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT));
+        rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT));
       }
       return rowCnt;
     }
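
In effect, for non-native (storage-handler) tables the optimizer now asks the
handler whether a stats-only answer is safe and, if so, reads the stats from
the handler rather than from HMS table parameters. The contract it relies on,
as a hedged sketch (see HiveStorageHandler for the actual interface):

    // canProvideBasicStatistics()   -> the handler can serve basic stats at all
    // canComputeQueryUsingStats(t)  -> the stats are exact enough to answer count(*)
    // getBasicStatistics(partish)   -> a map with numFiles / numRows / totalSize
    Map<String, String> stats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl));
    long rows = Long.parseLong(stats.get(StatsSetupConst.ROW_COUNT));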
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 239f57b69b3..81fde429cb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -2036,10 +2036,7 @@ public class StatsUtils {
   public static boolean checkCanProvideStats(Table table) {
     if (MetaStoreUtils.isExternalTable(table.getTTable())) {
-      if (MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics()) {
-        return true;
-      }
-      return false;
+      return MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics();
     }
     return true;
   }
@@ -2049,7 +2046,7 @@ public class StatsUtils {
* Can run additional checks compared to the version in StatsSetupConst.
*/
   public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map<String, String> params) {
-    return checkCanProvideStats(table) == true ? StatsSetupConst.areBasicStatsUptoDate(params) : false;
+    return checkCanProvideStats(table) && StatsSetupConst.areBasicStatsUptoDate(params);
   }
/**
@@ -2057,7 +2054,7 @@ public class StatsUtils {
* Can run additional checks compared to the version in StatsSetupConst.
*/
   public static boolean areColumnStatsUptoDateForQueryAnswering(Table table, Map<String, String> params, String colName) {
-    return checkCanProvideStats(table) == true ? StatsSetupConst.areColumnStatsUptoDate(params, colName) : false;
+    return checkCanProvideStats(table) && StatsSetupConst.areColumnStatsUptoDate(params, colName);
   }
/**