This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new b431e11eb19 HIVE-28268: Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false (Butao Zhang, reviewed by Denys Kuzmenko)
b431e11eb19 is described below
commit b431e11eb19def7df978547bd161ba102d59083c
Author: Butao Zhang <[email protected]>
AuthorDate: Fri Oct 25 22:49:37 2024 +0800
HIVE-28268: Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false (Butao Zhang, reviewed by Denys Kuzmenko)
Closes #5215
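
A note on the arithmetic the patch relies on: Iceberg's SnapshotSummary carries
total-records, total-position-deletes and total-equality-deletes. A position
delete removes exactly one row, so it can be subtracted from the total; an
equality delete can match any number of rows, so the total is left unadjusted
(and canComputeQueryUsingStats() guards the cases where the count cannot be
trusted). A minimal standalone sketch of that derivation, assuming the standard
Iceberg summary keys; the class and method names here are illustrative, not
part of the patch:

    import java.util.Map;

    final class SnapshotRowCountSketch {
      // Hypothetical helper mirroring the row-count derivation in getBasicStatistics().
      static long rowCount(Map<String, String> summary) {
        long totalRecords = Long.parseLong(summary.getOrDefault("total-records", "0"));
        long eqDeletes = Long.parseLong(summary.getOrDefault("total-equality-deletes", "0"));
        long posDeletes = Long.parseLong(summary.getOrDefault("total-position-deletes", "0"));
        // Subtract position deletes only when no equality deletes exist; the
        // result may be negative in edge cases, so fall back to the raw total.
        long actualRecords = totalRecords - (eqDeletes > 0 ? 0 : posDeletes);
        return actualRecords > 0 ? actualRecords : totalRecords;
      }
    }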
---
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 60 ++++----
.../apache/iceberg/mr/hive/IcebergTableUtil.java | 35 +++++
.../src/test/queries/positive/iceberg_stats.q | 22 +++
.../src/test/results/positive/iceberg_stats.q.out | 159 +++++++++++++++++++++
.../results/positive/write_iceberg_branch.q.out | 50 +++----
.../apache/hadoop/hive/ql/metadata/HiveUtils.java | 7 +-
.../hadoop/hive/ql/optimizer/StatsOptimizer.java | 11 +-
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 9 +-
8 files changed, 287 insertions(+), 66 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index b70bd15179f..fc7a9df77d2 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -464,41 +464,44 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
   public Map<String, String> getBasicStatistics(Partish partish) {
     org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
     // For write queries where rows got modified, don't fetch from cache as values could have changed.
-    Table table = getTable(hmsTable);
     Map<String, String> stats = Maps.newHashMap();
-    if (getStatsSource().equals(HiveMetaHook.ICEBERG)) {
-      if (table.currentSnapshot() != null) {
-        Map<String, String> summary = table.currentSnapshot().summary();
-        if (summary != null) {
+    if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) {
+      return hmsTable.getParameters();
+    }
+    Table table = getTable(hmsTable);

-          if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
-            stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
-          }
+    Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table);
+    if (snapshot != null) {
+      Map<String, String> summary = snapshot.summary();
+      if (summary != null) {
+
+        if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
+          stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
+        }

-          if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) {
-            long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
-            if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
-                summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
+        if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) {
+          long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
+          if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
+              summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {

-              long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
-              long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));
+            long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
+            long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));

-              long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);
-              totalRecords = actualRecords > 0 ? actualRecords : totalRecords;
-              // actualRecords maybe -ve in edge cases
-            }
-            stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords));
+            long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);
+            totalRecords = actualRecords > 0 ? actualRecords : totalRecords;
+            // actualRecords maybe -ve in edge cases
           }
+          stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords));
+        }

-          if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) {
-            stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP));
-          }
+        if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) {
+          stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP));
+        }
       }
-      } else {
-        stats.put(StatsSetupConst.NUM_FILES, "0");
-        stats.put(StatsSetupConst.ROW_COUNT, "0");
-        stats.put(StatsSetupConst.TOTAL_SIZE, "0");
       }
+    } else {
+      stats.put(StatsSetupConst.NUM_FILES, "0");
+      stats.put(StatsSetupConst.ROW_COUNT, "0");
+      stats.put(StatsSetupConst.TOTAL_SIZE, "0");
     }
     return stats;
   }
@@ -613,8 +616,9 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
   public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
     if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) {
       Table table = getTable(hmsTable);
-      if (table.currentSnapshot() != null) {
-        Map<String, String> summary = table.currentSnapshot().summary();
+      Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table);
+      if (snapshot != null) {
+        Map<String, String> summary = snapshot.summary();
         if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
             summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
index 773ae553b2b..d6ddf637a90 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
@@ -20,6 +20,7 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.time.ZoneId;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -31,6 +32,9 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.TimestampTZ;
+import org.apache.hadoop.hive.common.type.TimestampTZUtil;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
@@ -42,6 +46,7 @@ import org.apache.hadoop.hive.ql.parse.AlterTableExecuteSpec;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TransformSpec;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionStateUtil;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
@@ -59,6 +64,7 @@ import org.apache.iceberg.PositionDeletesScanTask;
import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.Schema;
+import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
@@ -79,6 +85,7 @@ import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.PropertyUtil;
+import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.StructProjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -567,4 +574,32 @@ public class IcebergTableUtil {
       throw new SemanticException(String.format("Error while fetching the partitions due to: %s", e));
     }
   }
+
+  public static Snapshot getTableSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) {
+    String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef());
+    Snapshot snapshot;
+    if (refName != null) {
+      snapshot = table.snapshot(refName);
+    } else if (hmsTable.getAsOfTimestamp() != null) {
+      ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() :
+          SessionState.get().getConf().getLocalTimeZone();
+      TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone);
+      long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli());
+      snapshot = table.snapshot(snapshotId);
+    } else if (hmsTable.getAsOfVersion() != null) {
+      try {
+        snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion()));
+      } catch (NumberFormatException e) {
+        SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion());
+        if (ref == null) {
+          throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " +
+              hmsTable.getAsOfVersion());
+        }
+        snapshot = table.snapshot(ref.snapshotId());
+      }
+    } else {
+      snapshot = table.currentSnapshot();
+    }
+    return snapshot;
+  }
 }
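
The helper above resolves which snapshot the statistics should come from, in a
fixed precedence order: an explicit branch/tag reference on the table, then
time travel by timestamp (getAsOfTimestamp), then by version (getAsOfVersion,
which may be a snapshot ID or a reference name), falling back to the current
snapshot. A hedged usage sketch; hmsTable and icebergTable are assumed to be
in scope and are not part of the patch:

    // Illustrative call site for the new helper.
    Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, icebergTable);
    if (snapshot != null && snapshot.summary() != null) {
      String rows = snapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP);
    }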
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
index de88018f32e..6fc965e1745 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
@@ -28,5 +28,27 @@ select count(*) from ice01;
insert overwrite table ice01 select * from ice01;
explain select count(*) from ice01;
+-- false means that count(*) query won't use row count stored in HMS
+set iceberg.hive.keep.stats=false;
+
+create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2');
+
+insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1);
+-- Iceberg table can utilize fetch task to directly retrieve the row count from iceberg SnapshotSummary
+explain select count(*) from ice03;
+select count(*) from ice03;
+
+-- delete some values
+delete from ice03 where id in (2,4);
+
+explain select count(*) from ice03;
+select count(*) from ice03;
+
+-- iow
+insert overwrite table ice03 select * from ice03;
+explain select count(*) from ice03;
+
drop table ice01;
drop table ice02;
+drop table ice03;
diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
index 33c60b54608..4e5b7094501 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
@@ -192,6 +192,155 @@ STAGE PLANS:
Processor Tree:
ListSink
+PREHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice03
+POSTHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC
+ TBLPROPERTIES('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice03
+PREHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice03
+POSTHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5
+PREHOOK: query: delete from ice03 where id in (2,4)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: default@ice03
+POSTHOOK: query: delete from ice03 where id in (2,4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: ice03
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ minReductionHashAggr: 0.6666666
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: insert overwrite table ice03 select * from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: default@ice03
+POSTHOOK: query: insert overwrite table ice03 select * from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: default@ice03
+PREHOOK: query: explain select count(*) from ice03
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice03
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice03
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
PREHOOK: query: drop table ice01
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice01
@@ -212,3 +361,13 @@ POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice02
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice02
+PREHOOK: query: drop table ice03
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@ice03
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice03
+POSTHOOK: query: drop table ice03
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@ice03
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice03
diff --git a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
index bc662c426a4..cbf1e0562d1 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out
@@ -234,17 +234,17 @@ STAGE PLANS:
alias: ice01
filterExpr: (a = 22) (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (a = 22) (type: boolean)
- Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), 22 (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -316,7 +316,7 @@ STAGE PLANS:
alias: ice01
filterExpr: (c = 66) (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (c = 66) (type: boolean)
Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
@@ -451,20 +451,20 @@ STAGE PLANS:
alias: ice01
filterExpr: a is not null (type: boolean)
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: a is not null (type: boolean)
- Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), a (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col5 (type: int)
null sort order: z
sort order: +
Map-reduce partition columns: _col5 (type: int)
- Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col6 (type: string), _col7 (type: int)
Execution mode: vectorized
Reducer 2
@@ -476,11 +476,11 @@ STAGE PLANS:
0 _col0 (type: int)
1 _col5 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
- Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: string), _col0 (type: int), _col5 (type: string), _col7 (type: string), _col2 (type: int), _col6 (type: bigint), _col4 (type: bigint), _col3 (type: int), _col10 (type: int), _col9 (type: string), _col8 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10
- Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((_col10 = _col1) and (_col10 > 100)) (type: boolean)
Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
@@ -498,14 +498,14 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint), _col3 (type: string), _col10 (type: int), _col9 (type: string), _col8 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
- Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -513,14 +513,14 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col10 (type: int), 'Merged' (type: string), (_col8 + 10) (type: int)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
@@ -543,24 +543,24 @@ STAGE PLANS:
name: default.ice01
Filter Operator
predicate: (_col10 = _col1) (type: boolean)
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col2 (type: string), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: int)
outputColumnNames: _col2, _col5, _col6, _col7
- Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
keys: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint)
minReductionHashAggr: 0.4
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint)
null sort order: zzzz
sort order: ++++
Map-reduce partition columns: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint)
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col4 (type: bigint)
Reducer 3
Execution mode: vectorized
@@ -570,7 +570,7 @@ STAGE PLANS:
keys: KEY._col0 (type: int), KEY._col1 (type: bigint), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (_col4 > 1L) (type: boolean)
Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE
@@ -795,14 +795,14 @@ STAGE PLANS:
TableScan
alias: ice01
Snapshot ref: branch_test1
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: a (type: int), b (type: string), c (type: int)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
index 5343a1bb3bb..e4076d11367 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
@@ -518,11 +518,8 @@ public final class HiveUtils {
   }

   public static String getTableSnapshotRef(String refName) {
-    Matcher ref = SNAPSHOT_REF.matcher(refName);
-    if (ref.matches()) {
-      return ref.group(1);
-    }
-    return null;
+    Matcher ref = SNAPSHOT_REF.matcher(String.valueOf(refName));
+    return ref.matches() ? ref.group(1) : null;
   }

   public static Boolean isTableTag(String refName) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 38d67660c63..1de37c42105 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.stats.Partish;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
@@ -942,10 +943,16 @@ public class StatsOptimizer extends Transform {
           rowCnt += partRowCnt;
         }
       } else { // unpartitioned table
-        if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) {
+        Map<String, String> basicStats = tbl.getParameters();
+        if (MetaStoreUtils.isNonNativeTable(tbl.getTTable())) {
+          if (!tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) {
+            return null;
+          }
+          basicStats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl));
+        } else if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) {
           return null;
         }
-        rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT));
+        rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT));
       }
       return rowCnt;
     }
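
In effect, for non-native (storage-handler) tables the optimizer now asks the
handler whether a stats-only answer is safe and, if so, reads the stats from
the handler rather than from HMS table parameters. The contract it relies on,
as a hedged sketch (see HiveStorageHandler for the actual interface):

    // canProvideBasicStatistics()   -> the handler can serve basic stats at all
    // canComputeQueryUsingStats(t)  -> the stats are exact enough to answer count(*)
    // getBasicStatistics(partish)   -> a map with numFiles / numRows / totalSize
    Map<String, String> stats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl));
    long rows = Long.parseLong(stats.get(StatsSetupConst.ROW_COUNT));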
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 239f57b69b3..81fde429cb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -2036,10 +2036,7 @@ public class StatsUtils {
   public static boolean checkCanProvideStats(Table table) {
     if (MetaStoreUtils.isExternalTable(table.getTTable())) {
-      if (MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics()) {
-        return true;
-      }
-      return false;
+      return MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics();
     }
     return true;
   }
@@ -2049,7 +2046,7 @@ public class StatsUtils {
* Can run additional checks compared to the version in StatsSetupConst.
*/
   public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map<String, String> params) {
-    return checkCanProvideStats(table) == true ? StatsSetupConst.areBasicStatsUptoDate(params) : false;
+    return checkCanProvideStats(table) && StatsSetupConst.areBasicStatsUptoDate(params);
   }
/**
@@ -2057,7 +2054,7 @@ public class StatsUtils {
* Can run additional checks compared to the version in StatsSetupConst.
*/
   public static boolean areColumnStatsUptoDateForQueryAnswering(Table table, Map<String, String> params, String colName) {
-    return checkCanProvideStats(table) == true ? StatsSetupConst.areColumnStatsUptoDate(params, colName) : false;
+    return checkCanProvideStats(table) && StatsSetupConst.areColumnStatsUptoDate(params, colName);
   }
/**