This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 2c18ef2d5dd HIVE-29147: Iceberg: Table-level column stats filter support (#5724)
2c18ef2d5dd is described below
commit 2c18ef2d5dde2a0c682eeb6dc9b7473b3b94386e
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Fri Sep 5 16:30:11 2025 +0200
HIVE-29147: Iceberg: Table-level column stats filter support (#5724)
---
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 180 ++++++++++-----------
.../apache/iceberg/mr/hive/IcebergTableUtil.java | 48 +++++-
.../iceberg/mr/hive/TestHiveIcebergStatistics.java | 2 +-
...berg_major_compaction_partition_evolution.q.out | 28 ++--
.../org/apache/hadoop/hive/ql/metadata/Hive.java | 2 +-
.../hive/ql/metadata/HiveStorageHandler.java | 66 ++++++--
6 files changed, 198 insertions(+), 128 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 88aa9c44991..337bb79e35d 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -20,6 +20,7 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.io.Serializable;
import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URISyntaxException;
@@ -27,7 +28,6 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
-import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
@@ -37,6 +37,7 @@
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.collections.MapUtils;
@@ -45,6 +46,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
@@ -89,7 +91,6 @@
import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.metadata.DummyPartition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
@@ -183,7 +184,6 @@
import org.apache.iceberg.puffin.BlobMetadata;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinCompressionCodec;
-import org.apache.iceberg.puffin.PuffinReader;
import org.apache.iceberg.puffin.PuffinWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
@@ -196,7 +196,6 @@
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types;
-import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.SerializationUtil;
import org.apache.iceberg.util.SnapshotUtil;
@@ -221,12 +220,14 @@ public class HiveIcebergStorageHandler extends DefaultStorageHandler implements
private static final String ICEBERG_URI_PREFIX = "iceberg://";
private static final String TABLE_NAME_SEPARATOR = "..";
- // Column index for partition metadata table
- public static final String COPY_ON_WRITE = RowLevelOperationMode.COPY_ON_WRITE.modeName();
- public static final String MERGE_ON_READ = RowLevelOperationMode.MERGE_ON_READ.modeName();
+ public static final String TABLE_DEFAULT_LOCATION = "TABLE_DEFAULT_LOCATION";
+
+ private static final String SPEC_ID = "spec-id";
+ private static final String PARTITION = "partition";
public static final String STATS = "/stats/snap-";
- public static final String TABLE_DEFAULT_LOCATION = "TABLE_DEFAULT_LOCATION";
+ public static final String COPY_ON_WRITE = RowLevelOperationMode.COPY_ON_WRITE.modeName();
+ public static final String MERGE_ON_READ = RowLevelOperationMode.MERGE_ON_READ.modeName();
private static final List<VirtualColumn> ACID_VIRTUAL_COLS = ImmutableList.of(
PARTITION_SPEC_ID, PARTITION_HASH, FILE_PATH, ROW_POSITION, PARTITION_PROJECTION);
@@ -601,20 +602,27 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
long snapshotId = tbl.currentSnapshot().snapshotId();
long snapshotSequenceNumber = tbl.currentSnapshot().sequenceNumber();
- colStats.forEach(statsObj -> {
- byte[] serializeColStats = SerializationUtils.serialize(statsObj);
- puffinWriter.add(
- new Blob(
- ColumnStatisticsObj.class.getSimpleName(),
- ImmutableList.of(1),
- snapshotId,
- snapshotSequenceNumber,
- ByteBuffer.wrap(serializeColStats),
- PuffinCompressionCodec.NONE,
- ImmutableMap.of("partition",
- String.valueOf(statsObj.getStatsDesc().getPartName()))
- ));
+ colStats.forEach(stats -> {
+ boolean isTblLevel = stats.getStatsDesc().isIsTblLevel();
+
+ for (Serializable statsObj : isTblLevel ? stats.getStatsObj() : Collections.singletonList(stats)) {
+ byte[] serializeColStats = SerializationUtils.serialize(statsObj);
+ puffinWriter.add(
+ new Blob(
+ ColumnStatisticsObj.class.getSimpleName(),
+ ImmutableList.of(isTblLevel ? tbl.spec().schema().findField(
+ ((ColumnStatisticsObj) statsObj).getColName()).fieldId() : 1),
+ snapshotId,
+ snapshotSequenceNumber,
+ ByteBuffer.wrap(serializeColStats),
+ PuffinCompressionCodec.NONE,
+ isTblLevel ?
+ ImmutableMap.of(SPEC_ID, String.valueOf(tbl.spec().specId())) :
+ ImmutableMap.of(PARTITION, String.valueOf(stats.getStatsDesc().getPartName()))
+ ));
+ }
});
+
puffinWriter.finish();
statisticsFile =
@@ -628,7 +636,13 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
.collect(ImmutableList.toImmutableList())
);
} catch (IOException e) {
- LOG.warn("Unable to write stats to puffin file {}", e.getMessage());
+ LOG.warn("Unable to write column stats to the Puffin file: {}",
e.getMessage());
+
+ Path path = new Path(statsPath);
+ FileSystem fs = path.getFileSystem(conf);
+ if (fs.exists(path)) {
+ fs.delete(path, false);
+ }
return false;
}
tbl.updateStatistics()
@@ -637,7 +651,7 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
return true;
} catch (Exception e) {
- LOG.warn("Unable to invalidate or merge stats: {}", e.getMessage());
+ LOG.warn("Unable to invalidate or merge column stats: {}",
e.getMessage());
}
return false;
}
@@ -653,21 +667,32 @@ public boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table
}
private boolean canProvideColStats(Table table, long snapshotId) {
- return IcebergTableUtil.getColStatsPath(table, snapshotId).isPresent();
+ return IcebergTableUtil.getColStatsPath(table, snapshotId) != null;
}
@Override
- public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+ public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable,
+ List<String> colNames) {
Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+
Snapshot snapshot = IcebergTableUtil.getTableSnapshot(table, hmsTable);
+ if (snapshot == null) {
+ return Lists.newArrayList();
+ }
- ColumnStatistics emptyStats = new ColumnStatistics();
- if (snapshot != null) {
- return IcebergTableUtil.getColStatsPath(table, snapshot.snapshotId())
- .map(statsPath -> readColStats(table, statsPath, null).getFirst())
- .orElse(emptyStats).getStatsObj();
+ Predicate<BlobMetadata> filter;
+ if (colNames != null) {
+ Set<String> columns = Sets.newHashSet(colNames);
+ filter = metadata -> {
+ int specId = Integer.parseInt(metadata.properties().get(SPEC_ID));
+ String column = table.specs().get(specId).schema().findColumnName(metadata.inputFields().getFirst());
+ return columns.contains(column);
+ };
+ } else {
+ filter = null;
}
- return emptyStats.getStatsObj();
+
+ return IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
}
@Override
@@ -684,9 +709,10 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms
MetastoreConf.ConfVars.STATS_NDV_DENSITY_FUNCTION);
double ndvTuner = MetastoreConf.getDoubleVar(getConf(),
MetastoreConf.ConfVars.STATS_NDV_TUNER);
- List<ColumnStatistics> partStats = IcebergTableUtil.getColStatsPath(table, snapshot.snapshotId())
- .map(statsPath -> readColStats(table, statsPath, Sets.newHashSet(partNames)))
- .orElse(Collections.emptyList());
+ Set<String> partitions = Sets.newHashSet(partNames);
+ Predicate<BlobMetadata> filter = metadata -> partitions.contains(metadata.properties().get(PARTITION));
+
+ List<ColumnStatistics> partStats = IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
partStats.forEach(colStats ->
colStats.getStatsObj().removeIf(statsObj ->
!colNames.contains(statsObj.getColName())));
@@ -700,30 +726,6 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms
return new AggrStats(colStatsList, partStats.size());
}
- private List<ColumnStatistics> readColStats(Table table, Path statsPath, Set<String> partNames) {
- List<ColumnStatistics> colStats = Lists.newArrayList();
-
- try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath.toString())).build()) {
- List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
-
- if (partNames != null) {
- blobMetadata = blobMetadata.stream()
- .filter(metadata -> partNames.contains(metadata.properties().get("partition")))
- .collect(Collectors.toList());
- }
- Iterator<ByteBuffer> it = Iterables.transform(reader.readAll(blobMetadata), Pair::second).iterator();
- LOG.info("Using col stats from : {}", statsPath);
-
- while (it.hasNext()) {
- byte[] byteBuffer = ByteBuffers.toByteArray(it.next());
- colStats.add(SerializationUtils.deserialize(byteBuffer));
- }
- } catch (Exception e) {
- LOG.warn(" Unable to read col stats: ", e);
- }
- return colStats;
- }
-
@Override
public boolean canComputeQueryUsingStats(Partish partish) {
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
@@ -757,28 +759,30 @@ private String getStatsSource() {
private boolean shouldRewriteColStats(Table tbl) {
return SessionStateUtil.getQueryState(conf).map(QueryState::getHiveOperation)
.filter(opType -> HiveOperation.ANALYZE_TABLE == opType).isPresent() ||
- IcebergTableUtil.getColStatsPath(tbl).isPresent();
+ IcebergTableUtil.getColStatsPath(tbl) != null;
}
private void checkAndMergeColStats(List<ColumnStatistics> statsNew, Table tbl) throws InvalidObjectException {
Long previousSnapshotId = tbl.currentSnapshot().parentId();
if (previousSnapshotId != null && canProvideColStats(tbl, previousSnapshotId)) {
- List<ColumnStatistics> statsOld = IcebergTableUtil.getColStatsPath(tbl, previousSnapshotId)
- .map(statsPath -> readColStats(tbl, statsPath, null))
- .orElse(Collections.emptyList());
boolean isTblLevel = statsNew.getFirst().getStatsDesc().isIsTblLevel();
Map<String, ColumnStatistics> oldStatsMap = Maps.newHashMap();
+ List<?> statsOld = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null);
+
if (!isTblLevel) {
- for (ColumnStatistics statsObjOld : statsOld) {
+ for (ColumnStatistics statsObjOld : (List<ColumnStatistics>) statsOld) {
oldStatsMap.put(statsObjOld.getStatsDesc().getPartName(), statsObjOld);
}
+ } else {
+ statsOld = Collections.singletonList(
+ new ColumnStatistics(null, (List<ColumnStatisticsObj>) statsOld));
}
for (ColumnStatistics statsObjNew : statsNew) {
String partitionKey = statsObjNew.getStatsDesc().getPartName();
ColumnStatistics statsObjOld = isTblLevel ?
- statsOld.getFirst() : oldStatsMap.get(partitionKey);
+ (ColumnStatistics) statsOld.getFirst() : oldStatsMap.get(partitionKey);
if (statsObjOld != null && statsObjOld.getStatsObjSize() != 0 &&
!statsObjNew.getStatsObj().isEmpty()) {
MetaStoreServerUtils.mergeColStats(statsObjNew, statsObjOld);
@@ -1864,19 +1868,6 @@ public void addResourcesForCreateTable(Map<String, String> tblProps, HiveConf hi
}
}
- /**
- * Check the operation type of all snapshots which are newer than the specified. The specified snapshot is excluded.
- * @param hmsTable table metadata stored in Hive Metastore
- * @param since the snapshot preceding the oldest snapshot which should be checked.
- * The value null means all should be checked.
- * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s,
- * false otherwise.
- *
- * @deprecated
- * <br>Use {@link HiveStorageHandler#getSnapshotContexts(
- * org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since)}
- * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()}).
- */
@Deprecated
@Override
public Boolean hasAppendsOnly(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since) {
@@ -2132,23 +2123,24 @@ public List<Partition> getPartitionsByExpr(org.apache.hadoop.hive.ql.metadata.Ta
.caseSensitive(false).includeColumnStats().ignoreResiduals();
try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
- FluentIterable.from(tasks).filter(task -> task.spec().isPartitioned()).forEach(task -> {
- DataFile file = task.file();
- PartitionSpec spec = task.spec();
-
- if (latestSpecOnly == null || latestSpecOnly && file.specId() == tableSpecId ||
- !latestSpecOnly && file.specId() != tableSpecId) {
-
- PartitionData partitionData = IcebergTableUtil.toPartitionData(task.partition(), spec.partitionType());
- String partName = spec.partitionToPath(partitionData);
-
- Map<String, String> partSpecMap = Maps.newLinkedHashMap();
- Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
-
- DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
- partitions.add(partition);
- }
- });
+ FluentIterable.from(tasks)
+ .filter(task -> task.spec().isPartitioned())
+ .forEach(task -> {
+ DataFile file = task.file();
+ PartitionSpec spec = task.spec();
+
+ if (latestSpecOnly == null || latestSpecOnly && file.specId() == tableSpecId ||
+ !latestSpecOnly && file.specId() != tableSpecId) {
+ PartitionData partitionData = IcebergTableUtil.toPartitionData(task.partition(), spec.partitionType());
+ String partName = spec.partitionToPath(partitionData);
+
+ Map<String, String> partSpecMap = Maps.newLinkedHashMap();
+ Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
+
+ DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
+ partitions.add(partition);
+ }
+ });
} catch (IOException e) {
throw new SemanticException(String.format("Error while fetching the partitions due to: %s", e));
}
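
[Context, not part of the patch: the table-level stats path above writes one Puffin blob per column, keyed by the Iceberg field id of that column and tagged with the writing partition-spec id under the "spec-id" property, so a read can skip blobs for columns the caller did not ask for before deserializing anything. A minimal sketch of such a blob-metadata filter follows, assuming the same "spec-id" property and a single input field id per blob as in the change above; the helper class and method names are hypothetical.]

    import java.util.Set;
    import java.util.function.Predicate;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.puffin.BlobMetadata;

    public final class ColStatsBlobFilters {

      private ColStatsBlobFilters() {
      }

      // Keeps only the table-level stats blobs that belong to the requested columns.
      // Each blob is expected to carry a "spec-id" property and a single input field id,
      // matching the table-level write path in the patch above.
      public static Predicate<BlobMetadata> forColumns(Table table, Set<String> requestedColumns) {
        return metadata -> {
          int specId = Integer.parseInt(metadata.properties().get("spec-id"));
          // Resolve the blob's field id back to a column name via the writing spec's schema.
          String column = table.specs().get(specId).schema()
              .findColumnName(metadata.inputFields().get(0));
          return requestedColumns.contains(column);
        };
      }
    }

[The partition-level path keeps its existing shape: blobs are tagged with a "partition" property and filtered by partition name instead.]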
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
index 098928bb204..3fe8b3838a2 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java
@@ -20,10 +20,12 @@
package org.apache.iceberg.mr.hive;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.time.ZoneId;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -34,7 +36,9 @@
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BinaryOperator;
import java.util.function.Function;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -79,6 +83,7 @@
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
+import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
@@ -91,12 +96,18 @@
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
+import org.apache.iceberg.puffin.BlobMetadata;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinReader;
import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ByteBuffers;
+import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.StructProjection;
@@ -223,18 +234,18 @@ static Snapshot getTableSnapshot(Table table, String snapshotRef) {
return table.currentSnapshot();
}
- static Optional<Path> getColStatsPath(Table table) {
+ static String getColStatsPath(Table table) {
return getColStatsPath(table, table.currentSnapshot().snapshotId());
}
- static Optional<Path> getColStatsPath(Table table, long snapshotId) {
+ static String getColStatsPath(Table table, long snapshotId) {
return table.statisticsFiles().stream()
.filter(stats -> stats.snapshotId() == snapshotId)
.filter(stats -> stats.blobMetadata().stream()
.anyMatch(metadata -> ColumnStatisticsObj.class.getSimpleName().equals(metadata.type()))
)
- .map(stats -> new Path(stats.path()))
- .findAny();
+ .map(StatisticsFile::path)
+ .findAny().orElse(null);
}
static PartitionStatisticsFile getPartitionStatsFile(Table table, long snapshotId) {
@@ -590,6 +601,33 @@ public static TransformSpec getTransformSpec(Table table, String transformName,
return spec;
}
+ public static <T> List<T> readColStats(Table table, Long snapshotId, Predicate<BlobMetadata> filter) {
+ List<T> colStats = Lists.newArrayList();
+
+ String statsPath = IcebergTableUtil.getColStatsPath(table, snapshotId);
+ if (statsPath == null) {
+ return colStats;
+ }
+ try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath)).build()) {
+ List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
+
+ if (filter != null) {
+ blobMetadata = blobMetadata.stream().filter(filter)
+ .toList();
+ }
+ Iterator<ByteBuffer> it = Iterables.transform(reader.readAll(blobMetadata), Pair::second).iterator();
+ LOG.info("Using column stats from: {}", statsPath);
+
+ while (it.hasNext()) {
+ byte[] byteBuffer = ByteBuffers.toByteArray(it.next());
+ colStats.add(SerializationUtils.deserialize(byteBuffer));
+ }
+ } catch (Exception e) {
+ LOG.warn("Unable to read column stats: {}", e.getMessage());
+ }
+ return colStats;
+ }
+
public static ExecutorService newDeleteThreadPool(String completeName, int numThreads) {
AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
return Executors.newFixedThreadPool(numThreads, runnable -> {
@@ -608,7 +646,7 @@ public static boolean hasUndergonePartitionEvolution(Table table) {
.anyMatch(id -> id != table.spec().specId());
}
- public static <T extends ContentFile> Set<String> getPartitionNames(Table icebergTable, Iterable<T> files,
+ public static <T extends ContentFile<?>> Set<String> getPartitionNames(Table icebergTable, Iterable<T> files,
Boolean latestSpecOnly) {
Set<String> partitions = Sets.newHashSet();
int tableSpecId = icebergTable.spec().specId();
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
index 832676df55b..7031f323d29 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java
@@ -345,7 +345,7 @@ public void testIcebergColStatsPath() throws IOException {
table.refresh();
- Path tblColPath = IcebergTableUtil.getColStatsPath(table).orElse(null);
+ Path tblColPath = new Path(IcebergTableUtil.getColStatsPath(table));
Assert.assertNotNull(tblColPath);
// Check that if colPath is created correctly
Assert.assertTrue(tblColPath.getFileSystem(shell.getHiveConf()).exists(tblColPath));
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
index 10dffc77d9b..b7494bd383b 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_major_compaction_partition_evolution.q.out
@@ -380,14 +380,14 @@ STAGE PLANS:
TableScan
alias: ice_orc
Snapshot ref: tag_v1
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -423,14 +423,14 @@ STAGE PLANS:
TableScan
alias: ice_orc
Snapshot ref: tag_v2
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -595,17 +595,17 @@ STAGE PLANS:
alias: ice_orc
filterExpr: company_id is not null (type: boolean)
Snapshot ref: tag_v1
- Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: company_id is not null (type: boolean)
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 4 Data size: 1568 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -642,17 +642,17 @@ STAGE PLANS:
alias: ice_orc
filterExpr: company_id is not null (type: boolean)
Snapshot ref: tag_v2
- Statistics: Num rows: 8 Data size: 1536 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: company_id is not null (type: boolean)
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: first_name (type: string), last_name (type: string), dept_id (type: bigint), team_id (type: bigint), company_id (type: bigint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 7 Data size: 1344 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 3136 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 82964ddb3c1..6a332a10f19 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -6181,7 +6181,7 @@ public List<ColumnStatisticsObj> getTableColumnStatistics(
List<ColumnStatisticsObj> retv = null;
try {
if (tbl.isNonNative() && tbl.getStorageHandler().canProvideColStatistics(tbl)) {
- return tbl.getStorageHandler().getColStatistics(tbl);
+ return tbl.getStorageHandler().getColStatistics(tbl, colNames);
}
if (checkTransactional) {
AcidUtils.TableSnapshot tableSnapshot = AcidUtils.getTableSnapshot(conf, tbl);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
index 5f29f1bb244..dda0873b6a2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
@@ -182,16 +182,14 @@ public interface HiveStorageHandler extends Configurable {
void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties);
/**
- * Deprecated use configureInputJobProperties/configureOutputJobProperties
- * methods instead.
- *
* Configures properties for a job based on the definition of the
* source or target table it accesses.
*
* @param tableDesc descriptor for the table being accessed
+ * @param jobProperties receives properties copied or transformed from the table properties
*
- * @param jobProperties receives properties copied or transformed
- * from the table properties
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #configureInputJobProperties} and {@link #configureOutputJobProperties} instead.
*/
@Deprecated
void configureTableJobProperties(TableDesc tableDesc, Map<String, String> jobProperties);
@@ -286,12 +284,26 @@ default boolean canProvidePartitionStatistics(org.apache.hadoop.hive.ql.metadata
/**
* Returns column statistics (upper/lower bounds, number of Null/NaN values, NDVs, histogram).
* @param table table object
+ * @param colNames list of column names
* @return list of ColumnStatisticsObj objects
*/
- default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+ default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table,
+ List<String> colNames) {
return null;
}
+ /**
+ * Returns column statistics (upper/lower bounds, number of Null/NaN values, NDVs, histogram).
+ * @param table table object
+ *
+ * @deprecated since 4.1.0, will be removed in 5.0.0,
+ * use {@link #getColStatistics(org.apache.hadoop.hive.ql.metadata.Table, List<String>)} instead.
+ */
+ @Deprecated
+ default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+ return getColStatistics(table, null);
+ }
+
/**
* Returns an aggregated column statistics for the supplied partition list
* @param table table object
@@ -340,7 +352,14 @@ default boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table tab
default boolean canComputeQueryUsingStats(Partish partish) {
return false;
}
-
+
+ /**
+ * Check if the storage handler can answer a few queries like count(1) purely using statistics.
+ * @param table table wrapper object
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #canComputeQueryUsingStats(Partish)} instead.
+ */
@Deprecated
default boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table table) {
return canComputeQueryUsingStats(Partish.buildFor(table));
@@ -657,6 +676,13 @@ default void storageHandlerCommit(Properties commitProperties, Operation operati
throw new UnsupportedOperationException();
}
+ /**
+ * Commits the inserts for the non-native tables.
+ * @param commitProperties Commit properties which are needed for the handler based commit
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #storageHandlerCommit(Properties, Operation)} instead.
+ */
@Deprecated
default void storageHandlerCommit(Properties commitProperties, boolean overwrite) throws HiveException {
storageHandlerCommit(commitProperties, overwrite ? Operation.IOW : Operation.OTHER);
@@ -700,7 +726,9 @@ default boolean isTimeTravelAllowed() {
/**
* Introduced by HIVE-25457 for iceberg to query metadata table.
* @return true if the storage handler can support it
- * @deprecated Use {@link #isTableMetaRefSupported()}
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #isTableMetaRefSupported()} instead.
*/
@Deprecated
default boolean isMetadataTableSupported() {
@@ -788,7 +816,6 @@ default Iterable<SnapshotContext> getSnapshotContexts(
return Collections.emptyList();
}
-
/**
* Alter table operations can rely on this to customize the EnvironmentContext to be used during the alter table
* invocation (both on client and server side of HMS)
@@ -801,20 +828,28 @@ default void prepareAlterTableEnvironmentContext(AbstractAlterTableDesc alterTab
/**
* Check the operation type of all snapshots which are newer than the specified. The specified snapshot is excluded.
- * @deprecated
- * <br>Use {@link HiveStorageHandler#getSnapshotContexts(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since)}
- * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()}).
*
* @param hmsTable table metadata stored in Hive Metastore
* @param since the snapshot preceding the oldest snapshot which should be checked.
* The value null means all should be checked.
- * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s, false otherwise.
+ * @return null if table is empty, true if all snapshots are {@link SnapshotContext.WriteOperationType#APPEND}s,
+ * false otherwise.
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link HiveStorageHandler#getSnapshotContexts(org.apache.hadoop.hive.ql.metadata.Table, SnapshotContext)} instead
+ * and check {@link SnapshotContext.WriteOperationType#APPEND}.equals({@link SnapshotContext#getOperation()})
*/
@Deprecated
default Boolean hasAppendsOnly(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since) {
return null;
}
+ /**
+ * Returns partition names for the table.
+ *
+ * @deprecated since 4.0.1, will be removed in 5.0.0,
+ * use {@link #getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table)} instead.
+ */
@Deprecated
default List<String> showPartitions(DDLOperationContext context,
org.apache.hadoop.hive.ql.metadata.Table tbl) throws UnsupportedOperationException, HiveException {
@@ -858,6 +893,11 @@ default List<String> getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table
throw new UnsupportedOperationException("Storage handler does not support getting partition names");
}
+ /**
+ * Returns partition names for the current table spec.
+ * @param table {@link org.apache.hadoop.hive.ql.metadata.Table} table metadata stored in Hive Metastore
+ * @return List of partition names
+ */
default List<String> getPartitionNames(org.apache.hadoop.hive.ql.metadata.Table table) throws SemanticException {
return getPartitionNames(table, Maps.newHashMap());
}
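
[Context, not part of the patch: with the HiveStorageHandler change above, callers can pass the column list they actually need, and the old single-argument overload delegates with null, which the Iceberg handler treats as "all columns". A hypothetical caller sketch follows; the helper class and method names are illustrative, not part of Hive.]

    import java.util.List;
    import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
    import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
    import org.apache.hadoop.hive.ql.metadata.Table;

    public final class ColStatsLookup {

      private ColStatsLookup() {
      }

      // Fetches stats for just the columns a query needs; a null list asks for all columns,
      // mirroring the deprecated single-argument overload.
      static List<ColumnStatisticsObj> fetch(Table hmsTable, List<String> neededColumns) {
        HiveStorageHandler handler = hmsTable.getStorageHandler();
        if (handler != null && handler.canProvideColStatistics(hmsTable)) {
          return handler.getColStatistics(hmsTable, neededColumns);
        }
        return List.of();
      }
    }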