Copilot commented on code in PR #6438: URL: https://github.com/apache/hive/pull/6438#discussion_r3151275823
########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + this.conf = configuration; + } + + @Override + public Configuration getConf() { + return conf; + } + + // what needs to be included in this run() method: + // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted + // delete all column stats + @Override + public void run() { + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!lock.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + RawStore ms = HMSHandler.getMSForConf(conf); + PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + + Query q = null; + try { + q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + // only fetch required fields, avoid loading heavy MTable objects + q.setResult( + "table.database.name, " + + "table.tableName, " + + "partitionName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" + ); + @SuppressWarnings("unchecked") + List<Object[]> rows = (List<Object[]>) q.execute(lastAnalyzedThreshold); + + for (Object[] row : rows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; // can be null for table-level stats + String excludeVal = (String) row[3]; // can be null + + // exclude check uses projected param value + if (excludeVal != null) { + LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); Review Comment: The exclude check treats any non-null table parameter value as excluded; if the property is present but set to "false" it will still skip deletion. Check the value (e.g., equalsIgnoreCase("true")) instead of only nullness. ```suggestion if ("true".equalsIgnoreCase(excludeVal)) { LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set to true on the table.", dbName, tblName); ``` ########## standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java: ########## @@ -695,4 +697,64 @@ static DescriptiveStatistics benchmarkPartitionManagement(@NotNull MicroBenchmar } } + static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchmark bench, + @NotNull BenchData data, + int tableCount) { + + String dbName = data.dbName + "_" + tableCount; + String tableNamePrefix = data.tableName; + final HMSClient client = data.getClient(); + final StatisticsManagementTask statsTask = new StatisticsManagementTask(); + try { + client.getHadoopConf().set("hive.metastore.uris", client.getServerURI().toString()); + client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); Review Comment: This sets a config key (`metastore.statistics.management.database.pattern`) that doesn't appear to be defined in `MetastoreConf` or read by `StatisticsManagementTask`, so it currently has no effect. Either wire this config into the task or drop it from the benchmark setup to avoid misleading results. ```suggestion ``` ########## standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java: ########## @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.metastore; + +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME; +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder; +import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; +import org.apache.hadoop.hive.metastore.security.HadoopThriftAuthBridge; +import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +@Category(MetastoreUnitTest.class) +public class TestStatisticsManagement { + + private IMetaStoreClient client; + private Configuration conf; + + @Before + public void setUp() throws Exception { + conf = MetastoreConf.newMetastoreConf(); + MetastoreConf.setVar(conf, ConfVars.METASTORE_METADATA_TRANSFORMER_CLASS, " "); + MetaStoreTestUtils.setConfForStandloneMode(conf); + conf.setBoolean(ConfVars.MULTITHREADED.getVarname(), false); + conf.setBoolean(ConfVars.HIVE_IN_TEST.getVarname(), true); + + // enable stats auto deletion, set up a short retention so threshold check triggers easily + MetastoreConf.setBoolVar(conf, ConfVars.STATISTICS_AUTO_DELETION, true); + MetastoreConf.setTimeVar(conf, ConfVars.STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS); + + MetaStoreTestUtils.startMetaStoreWithRetry(HadoopThriftAuthBridge.getBridge(), conf); + TestTxnDbUtil.setConfValues(conf); + TestTxnDbUtil.prepDb(conf); + + client = new HiveMetaStoreClient(conf); + } + + @After + public void tearDown() throws Exception { + if (client != null) { + // Drop any leftover databases, similar to TestPartitionManagement.java + List<String> dbs = client.getAllDatabases(DEFAULT_CATALOG_NAME); + for (String db : dbs) { + if (!db.equalsIgnoreCase(DEFAULT_DATABASE_NAME)) { + client.dropDatabase(DEFAULT_CATALOG_NAME, db, true, false, true); + } + } + } + try { + if (client != null) { + client.close(); + } + } finally { + client = null; + } + } + + @Test + public void testExpiredTableColStatsAreDeleted() throws Exception { + String dbName = "stats_db1"; + String tableName = "tbl1"; + createDbAndTable(dbName, tableName, false); + // create a column stats entry (table-level) + writeTableLevelColStats(dbName, tableName, "c1"); + // ensure stats exists + assertHasTableColStats(dbName, tableName, "c1"); + // make lastAnalyzed older than the threshold + makeAllTableColStatsOlderThanRetention(dbName, tableName); + + runStatisticsManagementTask(conf); + assertNoTableColStats(dbName, tableName, "c1"); + } + + @Test + public void testExcludedTableStatsAreNotDeleted() throws Exception { + String dbName = "stats_db2"; + String tableName = "tbl2"; + // Create a database and table that ARE excluded from auto deletion. + createDbAndTable(dbName, tableName, true); + writeTableLevelColStats(dbName, tableName, "c1"); + assertHasTableColStats(dbName, tableName, "c1"); + + // Manually set lastAnalyzed to a very old timestamp so it would normally be expired. + makeAllTableColStatsOlderThanRetention(dbName, tableName); + + runStatisticsManagementTask(conf); + + // Verify that stats are still present because the table is excluded. + assertHasTableColStats(dbName, tableName, "c1"); + } + + private void createDbAndTable(String dbName, String tableName, boolean exclude) throws Exception { + Database db; + if (!DEFAULT_DATABASE_NAME.equals(dbName)) { + db = new DatabaseBuilder() + .setName(dbName) + .setCatalogName(DEFAULT_CATALOG_NAME) + .create(client, conf); + } else { + db = client.getDatabase(DEFAULT_CATALOG_NAME, DEFAULT_DATABASE_NAME); + } + + // Build a simple test table with two columns. + TableBuilder tb = new TableBuilder() + .inDb(db) + .setTableName(tableName) + .addCol("c1", "double") + .addCol("c2", "string") + .setInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") + .setOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"); + + Table t = tb.build(conf); + + // If requested, mark this table as excluded from automatic stats deletion. + if (exclude) { + t.getParameters().put(StatisticsManagementTask.STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, "true"); + } + client.createTable(t); + client.flushCache(); + } + + private void writeTableLevelColStats(String db, String tbl, String col) throws TException { + // Create a stats object for one column. + ColumnStatisticsObj obj = new ColumnStatisticsObj(); + obj.setColName(col); + obj.setColType("double"); + + // Fill in minimal double-column statistics data. + DoubleColumnStatsData doubleData = new DoubleColumnStatsData(); + doubleData.setNumNulls(0); + doubleData.setNumDVs(1); + doubleData.setLowValue(1.0); + doubleData.setHighValue(1.0); + + ColumnStatisticsData data = new ColumnStatisticsData(); + data.setDoubleStats(doubleData); + obj.setStatsData(data); + + ColumnStatistics cs = new ColumnStatistics(); + ColumnStatisticsDesc desc = new ColumnStatisticsDesc(true, db, tbl); + desc.setCatName("hive"); + cs.addToStatsObj(obj); + + client.updateTableColumnStatistics(cs); + } Review Comment: `ColumnStatistics` is missing its `statsDesc` assignment. `updateTableColumnStatistics` dereferences `statsObj.getStatsDesc()` and will NPE unless you call `cs.setStatsDesc(desc)` (and typically set the engine if needed). ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + this.conf = configuration; + } + + @Override + public Configuration getConf() { + return conf; + } + + // what needs to be included in this run() method: + // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted + // delete all column stats + @Override + public void run() { + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!lock.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + RawStore ms = HMSHandler.getMSForConf(conf); + PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + + Query q = null; + try { + q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + // only fetch required fields, avoid loading heavy MTable objects + q.setResult( + "table.database.name, " + + "table.tableName, " + + "partitionName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" + ); + @SuppressWarnings("unchecked") + List<Object[]> rows = (List<Object[]>) q.execute(lastAnalyzedThreshold); + + for (Object[] row : rows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; // can be null for table-level stats + String excludeVal = (String) row[3]; // can be null + + // exclude check uses projected param value + if (excludeVal != null) { + LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + + // decide tableLevel based on whether this stat row is table-level or partition-level + // avoids loading table partition keys / MTable + request.setTableLevel(partName == null); + msc.deleteColumnStatistics(request); + } + } finally { + if (q != null) { + q.closeAll(); + } Review Comment: The `PersistenceManager` obtained from `ObjectStore` is never closed, which can leak resources/DB connections in a periodically running task. Wrap the PM in try/finally (or try-with-resources if supported) and close it after the query completes. ```suggestion try { Query q = null; try { q = pm.newQuery(MTableColumnStatistics.class); q.setFilter(filter); q.declareParameters(paramStr); // only fetch required fields, avoid loading heavy MTable objects q.setResult( "table.database.name, " + "table.tableName, " + "partitionName, " + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" ); @SuppressWarnings("unchecked") List<Object[]> rows = (List<Object[]>) q.execute(lastAnalyzedThreshold); for (Object[] row : rows) { String dbName = (String) row[0]; String tblName = (String) row[1]; String partName = (String) row[2]; // can be null for table-level stats String excludeVal = (String) row[3]; // can be null // exclude check uses projected param value if (excludeVal != null) { LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); continue; } DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); request.setEngine("hive"); // decide tableLevel based on whether this stat row is table-level or partition-level // avoids loading table partition keys / MTable request.setTableLevel(partName == null); msc.deleteColumnStatistics(request); } } finally { if (q != null) { q.closeAll(); } } } finally { pm.close(); ``` ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + this.conf = configuration; + } + + @Override + public Configuration getConf() { + return conf; + } + + // what needs to be included in this run() method: + // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted + // delete all column stats + @Override + public void run() { + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!lock.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + RawStore ms = HMSHandler.getMSForConf(conf); + PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + + Query q = null; + try { + q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + // only fetch required fields, avoid loading heavy MTable objects + q.setResult( + "table.database.name, " + + "table.tableName, " + + "partitionName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" + ); Review Comment: `MTableColumnStatistics`/`TAB_COL_STATS` doesn't have a `partitionName` field/column; this JDOQL result projection is likely to fail at runtime. If partition-level stats need handling, query `MPartitionColumnStatistics` and project `partition.partitionName` (or otherwise join to `MPartition`) instead of referencing `partitionName` directly. ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; Review Comment: `java.util.Map` is imported but never used in this class; unused imports are compilation errors in Java. Remove the import (or use it) to keep the build green. ```suggestion ``` ########## standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java: ########## @@ -1290,6 +1290,21 @@ public enum ConfVars { "metastore.partition.management.table.pattern", "*", "Automatic partition management will look for tables using the specified table pattern"), + STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.statistics.management.task.frequency", + "metastore.statistics.management.task.frequency", + 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics management for tables\n" + + "with table property 'statistics.auto.deletion'='true'. Statistics management include 2 configs. \n" + + "One is 'statistics.auto.deletion', and the other is 'statistics.retention.period'. \n" + + "When 'statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n " + + "column statistics are over the retention period, and then delete the column stats. \n"), + STATISTICS_RETENTION_PERIOD("metastore.statistics.retention.period", + "metastore.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + + "that we want to keep the stats for each table, which means if the stats are older than this period\n" + + "of time, the stats will be automatically deleted. \n"), + + STATISTICS_AUTO_DELETION("statistics.auto.deletion", "statistics.auto.deletion", true, + "Whether table/partition column statistics will be auto deleted after retention period"), Review Comment: The config docs describe enabling via a *table property* `'statistics.auto.deletion'='true'`, but `STATISTICS_AUTO_DELETION` is a metastore/server config (ConfVars) and there is no corresponding table property in this change (only an exclude property exists). Please clarify the docs to match the actual enablement mechanism and use the correct config keys (e.g., `metastore.statistics.retention.period` vs `statistics.retention.period`). ```suggestion 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics management.\n" + "Statistics management is controlled by the metastore/server config 'statistics.auto.deletion'\n" + "and uses 'metastore.statistics.retention.period' as the retention period. When\n" + "'statistics.auto.deletion' is enabled, statistics management will look for table/partition\n" + "column statistics that are older than the retention period and then delete them.\n"), STATISTICS_RETENTION_PERIOD("metastore.statistics.retention.period", "metastore.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + "that we want to keep the stats for each table, which means if the stats are older than this period\n" + "of time, the stats will be automatically deleted. \n"), STATISTICS_AUTO_DELETION("statistics.auto.deletion", "statistics.auto.deletion", true, "Metastore/server config that controls whether table/partition column statistics are auto deleted after the retention period"), ``` ########## standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java: ########## @@ -695,4 +697,64 @@ static DescriptiveStatistics benchmarkPartitionManagement(@NotNull MicroBenchmar } } + static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchmark bench, + @NotNull BenchData data, + int tableCount) { + + String dbName = data.dbName + "_" + tableCount; + String tableNamePrefix = data.tableName; + final HMSClient client = data.getClient(); + final StatisticsManagementTask statsTask = new StatisticsManagementTask(); + try { + client.getHadoopConf().set("hive.metastore.uris", client.getServerURI().toString()); + client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); + statsTask.setConf(client.getHadoopConf()); + + client.createDatabase(dbName); + for (int i = 0; i < tableCount; i++) { + String tableName = tableNamePrefix + "_" + i; + Util.TableBuilder tableBuilder = new Util.TableBuilder(dbName, tableName) + .withType(TableType.MANAGED_TABLE) + .withColumns(createSchema(Arrays.asList("col1:string", "col2:int"))) + .withPartitionKeys(createSchema(Collections.singletonList("part_col"))) + .withParameter("columnStatsAccurate", "true"); + + client.createTable(tableBuilder.build()); + addManyPartitionsNoException(client, dbName, tableName, null, Collections.singletonList("part_col"), 100); + + // simulate the partitions of each table which its stats has an old "lastAnalyzed" + List<Partition> partitions = client.listPartitions(dbName, tableName); + for (Partition partition : partitions) { + Map<String, String> params = partition.getParameters(); + // to manually change the "lastAnalyzed" to an old time, ex. 400 days + params.put("lastAnalyzed", String.valueOf(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400))); + } + client.alterPartitions(dbName, tableName, partitions); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + + Runnable preRun = () -> LOG.debug("Preparing for benchmark..."); + + try { + DescriptiveStatistics stats = bench.measure(preRun, statsTask, null); + + // check if the stats are deleted + for (int i = 0; i < tableCount; i++) { + String tableName = tableNamePrefix + "_" + i; + List<Partition> partitions = client.listPartitions(dbName, tableName); + for (Partition partition : partitions) { + Map<String, String> params = partition.getParameters(); + if (params.containsKey("lastAnalyzed")) { + throw new AssertionError("Partition stats not deleted for table: " + tableName); Review Comment: The post-run verification checks whether the partition parameters still contain `lastAnalyzed`, but the task deletes column statistics and won't remove this partition parameter. The assertion will stay true even when stats are deleted; verify via `getPartitionColumnStatistics`/`getTableColumnStatistics` (or direct TAB_COL_STATS query) instead. ```suggestion // check if the partition column stats are deleted for (int i = 0; i < tableCount; i++) { String tableName = tableNamePrefix + "_" + i; List<Partition> partitions = client.listPartitions(dbName, tableName); List<String> partitionNames = partitions.stream() .map(Partition::getValues) .map(values -> "part_col=" + values.get(0)) .collect(Collectors.toList()); if (!partitionNames.isEmpty()) { PartitionsStatsRequest request = new PartitionsStatsRequest(); request.setDbName(dbName); request.setTblName(tableName); request.setColNames(Collections.singletonList("col2")); request.setPartNames(partitionNames); if (!client.getPartitionsStatisticsReq(request).getPartStats().isEmpty()) { throw new AssertionError("Partition column stats not deleted for table: " + tableName); ``` ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + this.conf = configuration; Review Comment: `setConf` says it makes a copy (like other metastore tasks), but it assigns the passed Configuration directly. This can lead to shared-mutable Configuration across threads/tasks; use `new Configuration(configuration)` to isolate task-specific changes. ```suggestion this.conf = new Configuration(configuration); ``` ########## standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java: ########## @@ -695,4 +697,64 @@ static DescriptiveStatistics benchmarkPartitionManagement(@NotNull MicroBenchmar } } + static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchmark bench, + @NotNull BenchData data, + int tableCount) { + + String dbName = data.dbName + "_" + tableCount; + String tableNamePrefix = data.tableName; + final HMSClient client = data.getClient(); + final StatisticsManagementTask statsTask = new StatisticsManagementTask(); + try { + client.getHadoopConf().set("hive.metastore.uris", client.getServerURI().toString()); + client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); + statsTask.setConf(client.getHadoopConf()); + + client.createDatabase(dbName); + for (int i = 0; i < tableCount; i++) { + String tableName = tableNamePrefix + "_" + i; + Util.TableBuilder tableBuilder = new Util.TableBuilder(dbName, tableName) + .withType(TableType.MANAGED_TABLE) + .withColumns(createSchema(Arrays.asList("col1:string", "col2:int"))) + .withPartitionKeys(createSchema(Collections.singletonList("part_col"))) + .withParameter("columnStatsAccurate", "true"); + + client.createTable(tableBuilder.build()); + addManyPartitionsNoException(client, dbName, tableName, null, Collections.singletonList("part_col"), 100); + + // simulate the partitions of each table which its stats has an old "lastAnalyzed" + List<Partition> partitions = client.listPartitions(dbName, tableName); + for (Partition partition : partitions) { + Map<String, String> params = partition.getParameters(); + // to manually change the "lastAnalyzed" to an old time, ex. 400 days + params.put("lastAnalyzed", String.valueOf(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400))); + } + client.alterPartitions(dbName, tableName, partitions); Review Comment: The benchmark simulates expiration by mutating partition parameters `lastAnalyzed`, but `StatisticsManagementTask` deletes rows from `TAB_COL_STATS` based on the column-stats `lastAnalyzed` field (not partition parameters). This setup won't exercise the task correctly; create/update real column statistics (table/partition) and age their `lastAnalyzed` instead. ########## standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java: ########## @@ -345,7 +348,9 @@ private void runNonAcidBenchmarks() { .add("openTxns" + '.' + howMany, () -> benchmarkOpenTxns(bench, bData, howMany)) .add("PartitionManagementTask" + "." + howMany, - () -> benchmarkPartitionManagement(bench, bData, howMany)); + () -> benchmarkPartitionManagement(bench, bData, howMany)) + .add("PartitionStatisticsTask" + "." + howMany, Review Comment: Suite entry name is inconsistent: it registers `benchmarkStatisticsManagement` under "PartitionStatisticsTask" here. Consider renaming to "StatisticsManagementTask" to match the task/class name and the earlier suite entry. ```suggestion .add("StatisticsManagementTask" + "." + howMany, ``` ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + this.conf = configuration; + } + + @Override + public Configuration getConf() { + return conf; + } + + // what needs to be included in this run() method: + // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted + // delete all column stats + @Override + public void run() { + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!lock.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + RawStore ms = HMSHandler.getMSForConf(conf); + PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + + Query q = null; + try { + q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + // only fetch required fields, avoid loading heavy MTable objects + q.setResult( + "table.database.name, " + + "table.tableName, " + + "partitionName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" + ); + @SuppressWarnings("unchecked") + List<Object[]> rows = (List<Object[]>) q.execute(lastAnalyzedThreshold); + + for (Object[] row : rows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; // can be null for table-level stats + String excludeVal = (String) row[3]; // can be null + + // exclude check uses projected param value + if (excludeVal != null) { + LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + + // decide tableLevel based on whether this stat row is table-level or partition-level + // avoids loading table partition keys / MTable + request.setTableLevel(partName == null); + msc.deleteColumnStatistics(request); + } Review Comment: For partition-level stats (`partName != null`), the request does not set `part_names`. On the server side, an empty `part_names` causes `delete_column_statistics_req` to delete stats for *all partitions* of the table. Add the specific partition name(s) to the request (and dedupe/group by table+partition to avoid repeated deletes per column-stats row). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
