This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 2c18ef2d5dd HIVE-29147: Iceberg: Table-level column stats filter support (#5724)
2c18ef2d5dd is described below
commit 2c18ef2d5dde2a0c682eeb6dc9b7473b3b94386e
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Fri Sep 5 16:30:11 2025 +0200
HIVE-29147: Iceberg: Table-level column stats filter support (#5724)
---
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 180 ++++++++++-----------
.../apache/iceberg/mr/hive/IcebergTableUtil.java | 48 +++++-
.../iceberg/mr/hive/TestHiveIcebergStatistics.java | 2 +-
...berg_major_compaction_partition_evolution.q.out | 28 ++--
.../org/apache/hadoop/hive/ql/metadata/Hive.java | 2 +-
.../hive/ql/metadata/HiveStorageHandler.java | 66 ++++++--
6 files changed, 198 insertions(+), 128 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 88aa9c44991..337bb79e35d 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -20,6 +20,7 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.io.Serializable;
import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URISyntaxException;
@@ -27,7 +28,6 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
-import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
@@ -37,6 +37,7 @@
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.collections.MapUtils;
@@ -45,6 +46,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
@@ -89,7 +91,6 @@
import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.metadata.DummyPartition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
@@ -183,7 +184,6 @@
import org.apache.iceberg.puffin.BlobMetadata;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinCompressionCodec;
-import org.apache.iceberg.puffin.PuffinReader;
import org.apache.iceberg.puffin.PuffinWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
@@ -196,7 +196,6 @@
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types;
-import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.SerializationUtil;
import org.apache.iceberg.util.SnapshotUtil;
@@ -221,12 +220,14 @@ public class HiveIcebergStorageHandler extends DefaultStorageHandler implements
private static final String ICEBERG_URI_PREFIX = "iceberg://";
private static final String TABLE_NAME_SEPARATOR = "..";
- // Column index for partition metadata table
- public static final String COPY_ON_WRITE = RowLevelOperationMode.COPY_ON_WRITE.modeName();
- public static final String MERGE_ON_READ = RowLevelOperationMode.MERGE_ON_READ.modeName();
+ public static final String TABLE_DEFAULT_LOCATION = "TABLE_DEFAULT_LOCATION";
+
+ private static final String SPEC_ID = "spec-id";
+ private static final String PARTITION = "partition";
public static final String STATS = "/stats/snap-";
- public static final String TABLE_DEFAULT_LOCATION = "TABLE_DEFAULT_LOCATION";
+ public static final String COPY_ON_WRITE = RowLevelOperationMode.COPY_ON_WRITE.modeName();
+ public static final String MERGE_ON_READ = RowLevelOperationMode.MERGE_ON_READ.modeName();
private static final List<VirtualColumn> ACID_VIRTUAL_COLS = ImmutableList.of(
PARTITION_SPEC_ID, PARTITION_HASH, FILE_PATH, ROW_POSITION, PARTITION_PROJECTION);
@@ -601,20 +602,27 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
long snapshotId = tbl.currentSnapshot().snapshotId();
long snapshotSequenceNumber = tbl.currentSnapshot().sequenceNumber();
- colStats.forEach(statsObj -> {
- byte[] serializeColStats = SerializationUtils.serialize(statsObj);
- puffinWriter.add(
- new Blob(
- ColumnStatisticsObj.class.getSimpleName(),
- ImmutableList.of(1),
- snapshotId,
- snapshotSequenceNumber,
- ByteBuffer.wrap(serializeColStats),
- PuffinCompressionCodec.NONE,
- ImmutableMap.of("partition",
- String.valueOf(statsObj.getStatsDesc().getPartName()))
- ));
+ colStats.forEach(stats -> {
+ boolean isTblLevel = stats.getStatsDesc().isIsTblLevel();
+
+ for (Serializable statsObj : isTblLevel ? stats.getStatsObj() : Collections.singletonList(stats)) {
+ byte[] serializeColStats = SerializationUtils.serialize(statsObj);
+ puffinWriter.add(
+ new Blob(
+ ColumnStatisticsObj.class.getSimpleName(),
+ ImmutableList.of(isTblLevel ? tbl.spec().schema().findField(
+ ((ColumnStatisticsObj) statsObj).getColName()).fieldId() : 1),
+ snapshotId,
+ snapshotSequenceNumber,
+ ByteBuffer.wrap(serializeColStats),
+ PuffinCompressionCodec.NONE,
+ isTblLevel ?
+ ImmutableMap.of(SPEC_ID, String.valueOf(tbl.spec().specId())) :
+ ImmutableMap.of(PARTITION, String.valueOf(stats.getStatsDesc().getPartName()))
+ ));
+ }
});
+
puffinWriter.finish();
statisticsFile =
@@ -628,7 +636,13 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
.collect(ImmutableList.toImmutableList())
);
} catch (IOException e) {
- LOG.warn("Unable to write stats to puffin file {}", e.getMessage());
+ LOG.warn("Unable to write column stats to the Puffin file: {}",
e.getMessage());
+
+ Path path = new Path(statsPath);
+ FileSystem fs = path.getFileSystem(conf);
+ if (fs.exists(path)) {
+ fs.delete(path, false);
+ }
return false;
}
tbl.updateStatistics()
@@ -637,7 +651,7 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
return true;
} catch (Exception e) {
- LOG.warn("Unable to invalidate or merge stats: {}", e.getMessage());
+ LOG.warn("Unable to invalidate or merge column stats: {}",
e.getMessage());
}
return false;
}
@@ -653,21 +667,32 @@ public boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table
}
private boolean canProvideColStats(Table table, long snapshotId) {
- return IcebergTableUtil.getColStatsPath(table, snapshotId).isPresent();
+ return IcebergTableUtil.getColStatsPath(table, snapshotId) != null;
}
@Override
- public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+ public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable,
+ List<String> colNames) {
Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+
Snapshot snapshot = IcebergTableUtil.getTableSnapshot(table, hmsTable);
+ if (snapshot == null) {
+ return Lists.newArrayList();
+ }
- ColumnStatistics emptyStats = new ColumnStatistics();
- if (snapshot != null) {
- return IcebergTableUtil.getColStatsPath(table, snapshot.snapshotId())
- .map(statsPath -> readColStats(table, statsPath, null).getFirst())
- .orElse(emptyStats).getStatsObj();
+ Predicate<BlobMetadata> filter;
+ if (colNames != null) {
+ Set<String> columns = Sets.newHashSet(colNames);
+ filter = metadata -> {
+ int specId = Integer.parseInt(metadata.properties().get(SPEC_ID));
+ String column = table.specs().get(specId).schema().findColumnName(metadata.inputFields().getFirst());
+ return columns.contains(column);
+ };
+ } else {
+ filter = null;
}
- return emptyStats.getStatsObj();
+
+ return IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
}
@Override
@@ -684,9 +709,10 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms
MetastoreConf.ConfVars.STATS_NDV_DENSITY_FUNCTION);
double ndvTuner = MetastoreConf.getDoubleVar(getConf(),
MetastoreConf.ConfVars.STATS_NDV_TUNER);
- List<ColumnStatistics> partStats = IcebergTableUtil.getColStatsPath(table, snapshot.snapshotId())
- .map(statsPath -> readColStats(table, statsPath, Sets.newHashSet(partNames)))
- .orElse(Collections.emptyList());
+ Set<String> partitions = Sets.newHashSet(partNames);
+ Predicate<BlobMetadata> filter = metadata -> partitions.contains(metadata.properties().get(PARTITION));
+
+ List<ColumnStatistics> partStats = IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
partStats.forEach(colStats ->
colStats.getStatsObj().removeIf(statsObj ->
!colNames.contains(statsObj.getColName())));
@@ -700,30 +726,6 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms
return new AggrStats(colStatsList, partStats.size());
}
- private List<ColumnStatistics> readColStats(Table table, Path statsPath, Set<String> partNames) {
- List<ColumnStatistics> colStats = Lists.newArrayList();
-
- try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath.toString())).build()) {
- List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
-
- if (partNames != null) {
- blobMetadata = blobMetadata.stream()
- .filter(metadata -> partNames.contains(metadata.properties().get("partition")))
- .collect(Collectors.toList());
- }
- Iterator<ByteBuffer> it = Iterables.transform(reader.readAll(blobMetadata), Pair::second).iterator();
- LOG.info("Using col stats from : {}", statsPath);
-
- while (it.hasNext()) {
- byte[] byteBuffer = ByteBuffers.toByteArray(it.next());
- colStats.add(SerializationUtils.deserialize(byteBuffer));
- }
- } catch (Exception e) {
- LOG.warn(" Unable to read col stats: ", e);
- }
- return colStats;
- }
-
@Override
public boolean canComputeQueryUsingStats(Partish partish) {
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
@@ -757,28 +759,30 @@ private String getStatsSource() {
private boolean shouldRewriteColStats(Table tbl) {
return SessionStateUtil.getQueryState(conf).map(QueryState::getHiveOperation)
.filter(opType -> HiveOperation.ANALYZE_TABLE == opType).isPresent() ||
- IcebergTableUtil.getColStatsPath(tbl).isPresent();
+ IcebergTableUtil.getColStatsPath(tbl) != null;
}
private void checkAndMergeColStats(List<ColumnStatistics> statsNew, Table tbl) throws InvalidObjectException {
Long previousSnapshotId = tbl.currentSnapshot().parentId();
if (previousSnapshotId != null && canProvideColStats(tbl, previousSnapshotId)) {
- List<ColumnStatistics> statsOld = IcebergTableUtil.getColStatsPath(tbl, previousSnapshotId)
- .map(statsPath -> readColStats(tbl, statsPath, null))
- .orElse(Collections.emptyList());
boolean isTblLevel = statsNew.getFirst().getStatsDesc().isIsTblLevel();
Map<String, ColumnStatistics> oldStatsMap = Maps.newHashMap();
+ List<?> statsOld = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null);
+
if (!isTblLevel) {
- for (ColumnStatistics statsObjOld : statsOld) {
+ for (ColumnStatistics statsObjOld : (List<ColumnStatistics>) statsOld) {
oldStatsMap.put(statsObjOld.getStatsDesc().getPartName(), statsObjOld);
}
+ } else {
+ statsOld = Collections.singletonList(
+ new ColumnStatistics(null, (List<ColumnStatisticsObj>) statsOld));
}
for (ColumnStatistics statsObjNew : statsNew) {
String partitionKey = statsObjNew.getStatsDesc().getPartName();
ColumnStatistics statsObjOld = isTblLevel ?
- statsOld.getFirst() : oldStatsMap.get(partitionKey);
+ (ColumnStatistics) statsOld.getFirst() : oldStatsMap.get(partitionKey);
if (statsObjOld != null && statsObjOld.getStatsObjSize() != 0 &&
!statsObjNew.getStatsObj().isEmpty()) {
MetaStoreServerUtils.mergeColStats(statsObjNew, statsObjOld);
@@ -1864,19 +1868,6 @@ public void addResourcesForCreateTable(Map<String, String> tblProps, HiveConf hi
}
}
- /**
- * Check the operation type of all snapshots which are newer than the specified. The specified snapshot is excluded.
- * @param hmsTable table metadata stored in Hive Metastore
- * @param since the snapshot preceding the oldest snapshot which should be checked.
- * The value null means all should be checked.
- * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s,
- * false otherwise.
- *
- * @deprecated
- * <br>Use {@link HiveStorageHandler#getSnapshotContexts(
- * org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since)}
- * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()}).
- */
@Deprecated
@Override
public Boolean hasAppendsOnly(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since) {
@@ -2132,23 +2123,24 @@ public List<Partition> getPartitionsByExpr(org.apache.hadoop.hive.ql.metadata.Ta
.caseSensitive(false).includeColumnStats().ignoreResiduals();
try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
- FluentIterable.from(tasks).filter(task -> task.spec().isPartitioned()).forEach(task -> {
- DataFile file = task.file();
- PartitionSpec spec = task.spec();
-
- if (latestSpecOnly == null || latestSpecOnly && file.specId() == tableSpecId ||
- !latestSpecOnly && file.specId() != tableSpecId) {
-
- PartitionData partitionData = IcebergTableUtil.toPartitionData(task.partition(), spec.partitionType());
- String partName = spec.partitionToPath(partitionData);
-
- Map<String, String> partSpecMap = Maps.newLinkedHashMap();
- Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
-
- DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
- partitions.add(partition);
- }
- });
+ FluentIterable.from(tasks)
+ .filter(task -> task.spec().isPartitioned())
+ .forEach(task -> {
+ DataFile file = task.file();
+ PartitionSpec spec = task.spec();
+
+ if (latestSpecOnly == null || latestSpecOnly && file.specId() == tableSpecId ||
+ !latestSpecOnly && file.specId() != tableSpecId) {
+ PartitionData partitionData = IcebergTableUtil.toPartitionData(task.partition(), spec.partitionType());
+ String partName = spec.partitionToPath(partitionData);
+
+ Map<String, String> partSpecMap = Maps.newLinkedHashMap();
+ Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
+
+ DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
+ partitions.add(partition);
+ }
+ });
} catch (IOException e) {
throw new SemanticException(String.format("Error while fetching the partitions due to: %s", e));
}
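
[Context, not part of the patch: the table-level stats path above writes one Puffin blob per column, keyed by the Iceberg field id of that column and tagged with the writing partition-spec id under the "spec-id" property, so a read can skip blobs for columns the caller did not ask for before deserializing anything. A minimal sketch of such a blob-metadata filter follows, assuming the same "spec-id" property and a single input field id per blob as in the change above; the helper class and method names are hypothetical.]

    import java.util.Set;
    import java.util.function.Predicate;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.puffin.BlobMetadata;

    public final class ColStatsBlobFilters {

      private ColStatsBlobFilters() {
      }

      // Keeps only the table-level stats blobs that belong to the requested columns.
      // Each blob is expected to carry a "spec-id" property and a single input field id,
      // matching the table-level write path in the patch above.
      public static Predicate<BlobMetadata> forColumns(Table table, Set<String> requestedColumns) {
        return metadata -> {
          int specId = Integer.parseInt(metadata.properties().get("spec-id"));
          // Resolve the blob's field id back to a column name via the writing spec's schema.
          String column = table.specs().get(specId).schema()
              .findColumnName(metadata.inputFields().get(0));
          return requestedColumns.contains(column);
        };
      }
    }

[The partition-level path keeps its existing shape: blobs are tagged with a "partition" property and filtered by partition name instead.]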
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
index 098928bb204..3fe8b3838a2 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
@@ -20,10 +20,12 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.time.ZoneId;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -34,7 +36,9 @@
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BinaryOperator;
import java.util.function.Function;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -79,6 +83,7 @@
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
+import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
@@ -91,12 +96,18 @@
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
+import org.apache.iceberg.puffin.BlobMetadata;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinReader;
import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ByteBuffers;
+import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.StructProjection;
@@ -223,18 +234,18 @@ static Snapshot getTableSnapshot(Table table, String snapshotRef) {
return table.currentSnapshot();
}
- static Optional<Path> getColStatsPath(Table table) {
+ static String getColStatsPath(Table table) {
return getColStatsPath(table, table.currentSnapshot().snapshotId());
}
- static Optional<Path> getColStatsPath(Table table, long snapshotId) {
+ static String getColStatsPath(Table table, long snapshotId) {
return table.statisticsFiles().stream()
.filter(stats -> stats.snapshotId() == snapshotId)
.filter(stats -> stats.blobMetadata().stream()
.anyMatch(metadata -> ColumnStatisticsObj.class.getSimpleName().equals(metadata.type()))
)
- .map(stats -> new Path(stats.path()))
- .findAny();
+ .map(StatisticsFile::path)
+ .findAny().orElse(null);
}
static PartitionStatisticsFile getPartitionStatsFile(Table table, long snapshotId) {
@@ -590,6 +601,33 @@ public static TransformSpec getTransformSpec(Table table, String transformName,
return spec;
}
+ public static <T> List<T> readColStats(Table table, Long snapshotId, Predicate<BlobMetadata> filter) {
+ List<T> colStats = Lists.newArrayList();
+
+ String statsPath = IcebergTableUtil.getColStatsPath(table, snapshotId);
+ if (statsPath == null) {
+ return colStats;
+ }
+ try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath)).build()) {
+ List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
+
+ if (filter != null) {
+ blobMetadata = blobMetadata.stream().filter(filter)
+ .toList();
+ }
+ Iterator<ByteBuffer> it = Iterables.transform(reader.readAll(blobMetadata), Pair::second).iterator();
+ LOG.info("Using column stats from: {}", statsPath);
+
+ while (it.hasNext()) {
+ byte[] byteBuffer = ByteBuffers.toByteArray(it.next());
+ colStats.add(SerializationUtils.deserialize(byteBuffer));
+ }
+ } catch (Exception e) {
+ LOG.warn("Unable to read column stats: {}", e.getMessage());
+ }
+ return colStats;
+ }
+
public static ExecutorService newDeleteThreadPool(String completeName, int numThreads) {
AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
return Executors.newFixedThreadPool(numThreads, runnable -> {
@@ -608,7 +646,7 @@ public static boolean hasUndergonePartitionEvolution(Table table) {
.anyMatch(id -> id != table.spec().specId());
}
- public static <T extends ContentFile> Set<String> getPartitionNames(Table icebergTable, Iterable<T> files,
+ public static <T extends ContentFile<?>> Set<String> getPartitionNames(Table icebergTable, Iterable<T> files,
Boolean latestSpecOnly) {
Set<String> partitions = Sets.newHashSet();
int tableSpecId = icebergTable.spec().specId();
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
index 832676df55b..7031f323d29 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
@@ -345,7 +345,7 @@ public void testIcebergColStatsPath() throws IOException {
table.refresh();
- Path tblColPath = IcebergTableUtil.getColStatsPath(table).orElse(null);
+ Path tblColPath = new Path(IcebergTableUtil.getColStatsPath(table));
Assert.assertNotNull(tblColPath);
// Check that if colPath is created correctly
Assert.assertTrue(tblColPath.getFileSystem(shell.getHiveConf()).exists(tblColPath));
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
index 10dffc77d9b..b7494bd383b 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
@@ -380,14 +380,14 @@ STAGE PLANS:
TableScan
alias: ice_orc
Snapshot ref: tag_v1
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -423,14 +423,14 @@ STAGE PLANS:
TableScan
alias: ice_orc
Snapshot ref: tag_v2
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -595,17 +595,17 @@ STAGE PLANS:
alias: ice_orc
filterExpr: company_id is not null (type: boolean)
Snapshot ref: tag_v1
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: company_id is not null (type: boolean)
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -642,17 +642,17 @@ STAGE PLANS:
alias: ice_orc
filterExpr: company_id is not null (type: boolean)
Snapshot ref: tag_v2
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: company_id is not null (type: boolean)
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 82964ddb3c1..6a332a10f19 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -6181,7 +6181,7 @@ public List<ColumnStatisticsObj> getTableColumnStatistics(
List<ColumnStatisticsObj> retv = null;
try {
if (tbl.isNonNative() && tbl.getStorageHandler().canProvideColStatistics(tbl)) {
- return tbl.getStorageHandler().getColStatistics(tbl);
+ return tbl.getStorageHandler().getColStatistics(tbl, colNames);
}
if (checkTransactional) {
AcidUtils.TableSnapshot tableSnapshot = AcidUtils.getTableSnapshot(conf, tbl);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
index 5f29f1bb244..dda0873b6a2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
@@ -182,16 +182,14 @@ public interface HiveStorageHandler extends Configurable {
void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties);
/**
- * Deprecated use configureInputJobProperties/configureOutputJobProperties
- * methods instead.
- *
* Configures properties for a job based on the definition of the
* source or target table it accesses.
*
* @param tableDesc descriptor for the table being accessed
+ * @param jobProperties receives properties copied or transformed from the table properties
*
- * @param jobProperties receives properties copied or transformed
- * from the table properties
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #configureInputJobProperties} and {@link #configureOutputJobProperties} instead.
*/
@Deprecated
void configureTableJobProperties(TableDesc tableDesc, Map<String, String> jobProperties);
@@ -286,12 +284,26 @@ default boolean canProvidePartitionStatistics(org.apache.hadoop.hive.ql.metadata
/**
* Returns column statistics (upper/lower bounds, number of Null/NaN values, NDVs, histogram).
* @param table table object
+ * @param colNames list of column names
* @return list of ColumnStatisticsObj objects
*/
- default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+ default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table,
+ List<String> colNames) {
return null;
}
+ /**
+ * Returns column statistics (upper/lower bounds, number of Null/NaN values, NDVs, histogram).
+ * @param table table object
+ *
+ * @deprecated since 4.1.0, will be removed in 5.0.0,
+ * use {@link #getColStatistics(org.apache.hadoop.hive.ql.metadata.Table, List<String>)} instead.
+ */
+ @Deprecated
+ default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+ return getColStatistics(table, null);
+ }
+
/**
* Returns an aggregated column statistics for the supplied partition list
* @param table table object
@@ -340,7 +352,14 @@ default boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table tab
default boolean canComputeQueryUsingStats(Partish partish) {
return false;
}
-
+
+ /**
+ * Check if the storage handler can answer a few queries like count(1) purely using statistics.
+ * @param table table wrapper object
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #canComputeQueryUsingStats(Partish)} instead.
+ */
@Deprecated
default boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table table) {
return canComputeQueryUsingStats(Partish.buildFor(table));
@@ -657,6 +676,13 @@ default void storageHandlerCommit(Properties commitProperties, Operation operati
throw new UnsupportedOperationException();
}
+ /**
+ * Commits the inserts for the non-native tables.
+ * @param commitProperties Commit properties which are needed for the handler based commit
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #storageHandlerCommit(Properties, Operation)} instead.
+ */
@Deprecated
default void storageHandlerCommit(Properties commitProperties, boolean overwrite) throws HiveException {
storageHandlerCommit(commitProperties, overwrite ? Operation.IOW : Operation.OTHER);
@@ -700,7 +726,9 @@ default boolean isTimeTravelAllowed() {
/**
* Introduced by HIVE-25457 for iceberg to query metadata table.
* @return true if the storage handler can support it
- * @deprecated Use {@link #isTableMetaRefSupported()}
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #isTableMetaRefSupported()} instead.
*/
@Deprecated
default boolean isMetadataTableSupported() {
@@ -788,7 +816,6 @@ default Iterable<SnapshotContext> getSnapshotContexts(
return Collections.emptyList();
}
-
/**
* Alter table operations can rely on this to customize the EnvironmentContext to be used during the alter table
* invocation (both on client and server side of HMS)
@@ -801,20 +828,28 @@ default void prepareAlterTableEnvironmentContext(AbstractAlterTableDesc alterTab
/**
* Check the operation type of all snapshots which are newer than the specified. The specified snapshot is excluded.
- * @deprecated
- * <br>Use {@link HiveStorageHandler#getSnapshotContexts(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since)}
- * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()}).
*
* @param hmsTable table metadata stored in Hive Metastore
* @param since the snapshot preceding the oldest snapshot which should be checked.
* The value null means all should be checked.
- * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s, false otherwise.
+ * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s,
+ * false otherwise.
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link HiveStorageHandler#getSnapshotContexts(org.apache.hadoop.hive.ql.metadata.Table, SnapshotContext)} instead
+ * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()})
*/
@Deprecated
default Boolean hasAppendsOnly(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since) {
return null;
}
+ /**
+ * Returns partition names for the table.
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table)} instead.
+ */
@Deprecated
default List<String> showPartitions(DDLOperationContext context,
org.apache.hadoop.hive.ql.metadata.Table tbl) throws UnsupportedOperationException, HiveException {
@@ -858,6 +893,11 @@ default List<String> getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table
throw new UnsupportedOperationException("Storage handler does not support getting partition names");
}
+ /**
+ * Returns partition names for the current table spec.
+ * @param table {@link org.apache.hadoop.hive.ql.metadata.Table} table metadata stored in Hive Metastore
+ * @return List of partition names
+ */
default List<String> getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table table) throws SemanticException {
return getPartitionNames(table, Maps.newHashMap());
}
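
[Context, not part of the patch: with the HiveStorageHandler change above, callers can pass the column list they actually need, and the old single-argument overload delegates with null, which the Iceberg handler treats as "all columns". A hypothetical caller sketch follows; the helper class and method names are illustrative, not part of Hive.]

    import java.util.List;
    import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
    import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
    import org.apache.hadoop.hive.ql.metadata.Table;

    public final class ColStatsLookup {

      private ColStatsLookup() {
      }

      // Fetches stats for just the columns a query needs; a null list asks for all columns,
      // mirroring the deprecated single-argument overload.
      static List<ColumnStatisticsObj> fetch(Table hmsTable, List<String> neededColumns) {
        HiveStorageHandler handler = hmsTable.getStorageHandler();
        if (handler != null && handler.canProvideColStatistics(hmsTable)) {
          return handler.getColStatistics(hmsTable, neededColumns);
        }
        return List.of();
      }
    }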