Copilot commented on code in PR #6438:
URL: https://github.com/apache/hive/pull/6438#discussion_r3205830044


##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java:
##########
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Statistics management task responsible for periodic auto-deletion of table 
and partition column
+ * statistics based on a configured retention interval.
+ *
+ * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task 
scans
+ * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code 
lastAnalyzed} timestamp
+ * is older than {@code metastore.statistics.retention.period}, and deletes 
them.

Review Comment:
   The class-level Javadoc references config keys `metastore.statistics.*`, but 
the actual ConfVars introduced/used by this task are 
`metastore.column.statistics.*` (e.g., 
`metastore.column.statistics.auto.deletion`, 
`metastore.column.statistics.retention.period`). Please update the Javadoc to 
match the real configuration keys to avoid misleading operators.
   



##########
standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java:
##########
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore;
+
+import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME;
+import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Database;
+import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder;
+import org.apache.hadoop.hive.metastore.client.builder.TableBuilder;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.apache.hadoop.hive.metastore.security.HadoopThriftAuthBridge;
+import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil;
+import org.apache.thrift.TException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Unit tests for {@link StatisticsManagementTask}, verifying that expired 
table-level column
+ * statistics are deleted on schedule and that tables marked with the exclude 
property are left
+ * untouched.
+ */
+@Category(MetastoreUnitTest.class)
+public class TestStatisticsManagement {
+
+    private IMetaStoreClient client;
+    private Configuration conf;
+
+    @Before
+    public void setUp() throws Exception {
+        conf = MetastoreConf.newMetastoreConf();
+        MetastoreConf.setVar(conf, 
ConfVars.METASTORE_METADATA_TRANSFORMER_CLASS, " ");
+        MetaStoreTestUtils.setConfForStandloneMode(conf);
+        conf.setBoolean(ConfVars.MULTITHREADED.getVarname(), false);
+        conf.setBoolean(ConfVars.HIVE_IN_TEST.getVarname(), true);
+
+        // Enable stats auto deletion with a short retention so the threshold 
check triggers easily.
+        MetastoreConf.setBoolVar(conf, 
ConfVars.COLUMN_STATISTICS_AUTO_DELETION, true);
+        MetastoreConf.setTimeVar(conf, 
ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS);
+
+        
MetaStoreTestUtils.startMetaStoreWithRetry(HadoopThriftAuthBridge.getBridge(), 
conf);
+        TestTxnDbUtil.setConfValues(conf);
+        TestTxnDbUtil.prepDb(conf);
+
+        client = new HiveMetaStoreClient(conf);
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        if (client != null) {
+            // Drop any leftover databases, similar to 
TestPartitionManagement.java.
+            List<String> dbs = client.getAllDatabases(DEFAULT_CATALOG_NAME);
+            for (String db : dbs) {
+                if (!db.equalsIgnoreCase(DEFAULT_DATABASE_NAME)) {
+                    client.dropDatabase(DEFAULT_CATALOG_NAME, db, true, false, 
true);
+                }
+            }
+        }
+        try {
+            if (client != null) {
+                client.close();
+            }
+        } finally {
+            client = null;
+        }
+    }
+
+    @Test
+    public void testExpiredTableColStatsAreDeleted() throws Exception {
+        String dbName = "stats_db1";
+        String tableName = "tbl1";
+        createDbAndTable(dbName, tableName, false);
+        writeTableLevelColStats(dbName, tableName, "c1");
+        assertHasTableColStats(dbName, tableName, "c1");
+        makeAllTableColStatsOlderThanRetention(dbName, tableName);
+
+        runStatisticsManagementTask(conf);
+
+        assertNoTableColStats(dbName, tableName, "c1");
+    }

Review Comment:
   The production task deletes both table-level and partition-level column 
stats, but the current unit tests only cover table-level deletion and the 
exclude-table-property behavior. Please add tests that create partition 
column stats, backdate `MPartitionColumnStatistics.lastAnalyzed`, run the 
task, and assert that expired partition stats are deleted while partition 
stats on tables carrying the exclude property are retained.



##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java:
##########
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Statistics management task responsible for periodic auto-deletion of table 
and partition column
+ * statistics based on a configured retention interval.
+ *
+ * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task 
scans
+ * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code 
lastAnalyzed} timestamp
+ * is older than {@code metastore.statistics.retention.period}, and deletes 
them.
+ * Individual tables may opt out by setting the table property
+ * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null 
value.
+ */
+public class StatisticsManagementTask extends ObjectStore implements 
MetastoreTaskThread {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(StatisticsManagementTask.class);
+
+    /**
+     * Table property key that, when present on a table, excludes it from 
automatic statistics
+     * deletion regardless of the global retention setting.
+     */
+    public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY =
+            "statistics.auto.deletion.exclude";
+
+    private static final Lock LOCK = new ReentrantLock();
+
+    private Configuration conf;
+
+    @Override
+    public long runFrequency(TimeUnit unit) {
+        return MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit);
+    }
+
+    @Override
+    public void setConf(Configuration configuration) {
+        this.conf = new Configuration(configuration);
+        super.setConf(configuration);

Review Comment:
   `setConf` makes a defensive copy into `this.conf`, but then calls 
`super.setConf(configuration)` with the original instance. Since 
`ObjectStore.setConf` initializes internal state (PMF/PM) from the provided 
Configuration, this can lead to the task using one conf while the ObjectStore 
base uses another. Pass the same defensive copy to `super.setConf(...)` for 
consistency.
   



##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java:
##########
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Statistics management task responsible for periodic auto-deletion of table 
and partition column
+ * statistics based on a configured retention interval.
+ *
+ * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task 
scans
+ * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code 
lastAnalyzed} timestamp
+ * is older than {@code metastore.statistics.retention.period}, and deletes 
them.
+ * Individual tables may opt out by setting the table property
+ * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null 
value.
+ */
+public class StatisticsManagementTask extends ObjectStore implements 
MetastoreTaskThread {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(StatisticsManagementTask.class);
+
+    /**
+     * Table property key that, when present on a table, excludes it from 
automatic statistics
+     * deletion regardless of the global retention setting.
+     */
+    public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY =
+            "statistics.auto.deletion.exclude";
+
+    private static final Lock LOCK = new ReentrantLock();
+
+    private Configuration conf;
+
+    @Override
+    public long runFrequency(TimeUnit unit) {
+        return MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit);
+    }
+
+    @Override
+    public void setConf(Configuration configuration) {
+        this.conf = new Configuration(configuration);
+        super.setConf(configuration);
+    }
+
+    @Override
+    public Configuration getConf() {
+        return conf;
+    }
+
+    @Override
+    public void run() {
+        LOG.debug("Auto statistics deletion started. Cleaning up 
table/partition column statistics over the retention period.");
+        long retentionMillis =
+                MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 
TimeUnit.MILLISECONDS);
+        if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) {
+            LOG.info("Statistics auto deletion is set to off currently.");
+            return;
+        }
+        if (!LOCK.tryLock()) {
+            return;
+        }
+        try {
+            long now = System.currentTimeMillis();
+            long lastAnalyzedThreshold = (now - retentionMillis) / 1000;
+            PersistenceManager pm = getPersistenceManager();
+            boolean committed = false;
+            openTransaction();
+            try {
+                try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) {
+                    deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold);
+                    deleteExpiredPartitionColStats(pm, msc, 
lastAnalyzedThreshold);
+                }
+                committed = commitTransaction();
+            } finally {
+                if (!committed) {
+                    rollbackTransaction();
+                }
+            }
+        } catch (Exception e) {
+            LOG.error("Error during statistics auto deletion", e);
+        } finally {
+            LOCK.unlock();
+        }
+    }
+
+    /**
+     * Deletes expired table-level column statistics from {@code 
TAB_COL_STATS}.
+     * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} 
property set are skipped.
+     *
+     * @param pm                   the JDO persistence manager to use for the 
query
+     * @param msc                  the metastore client used to issue delete 
requests
+     * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed 
below this value are expired
+     * @throws Exception if the JDO query or the delete request fails
+     */
+    private void deleteExpiredTableColStats(PersistenceManager pm, 
IMetaStoreClient msc,
+                                            long lastAnalyzedThreshold) throws 
Exception {
+        Query tblQuery = null;
+        try {
+            tblQuery = pm.newQuery(MTableColumnStatistics.class);
+            tblQuery.setFilter("lastAnalyzed < threshold");
+            tblQuery.declareParameters("long threshold");
+            // partitionName does not exist on MTableColumnStatistics; omitted 
here
+            tblQuery.setResult(
+                    "table.database.name, "
+                            + "table.tableName, "
+                            + "table.parameters.get(\"" + 
STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")");
+            @SuppressWarnings("unchecked")
+            List<Object[]> tblRows = (List<Object[]>) 
tblQuery.execute(lastAnalyzedThreshold);
+            for (Object[] row : tblRows) {
+                String dbName = (String) row[0];
+                String tblName = (String) row[1];
+                String excludeVal = (String) row[2];
+                if (excludeVal != null) {
+                    LOG.info("Skipping auto deletion of table stats for {}.{} 
due to exclude property.",
+                            dbName, tblName);
+                    continue;
+                }
+                DeleteColumnStatisticsRequest request = new 
DeleteColumnStatisticsRequest(dbName, tblName);
+                request.setEngine("hive");
+                request.setTableLevel(true);

Review Comment:
   `deleteExpiredTableColStats` is filtering expired *rows* in `TAB_COL_STATS`, 
but the delete request is issued without `col_names`, which causes HMS to 
delete stats for *all columns* in the table (including non-expired ones). Also, 
because the query returns one row per column, this will invoke deletion 
repeatedly for the same table and can emit duplicate listener events (the 
rawstore delete returns true even when 0 rows are affected). Project the 
catalog name and column name in addition to the current fields, group the rows 
by (catalog, db, table), and pass only the expired `col_names` (and set 
`cat_name` from the query result) so only expired stats are removed once per 
table.
   



##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java:
##########
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Statistics management task responsible for periodic auto-deletion of table 
and partition column
+ * statistics based on a configured retention interval.
+ *
+ * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task 
scans
+ * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code 
lastAnalyzed} timestamp
+ * is older than {@code metastore.statistics.retention.period}, and deletes 
them.
+ * Individual tables may opt out by setting the table property
+ * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null 
value.
+ */
+public class StatisticsManagementTask extends ObjectStore implements 
MetastoreTaskThread {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(StatisticsManagementTask.class);
+
+    /**
+     * Table property key that, when present on a table, excludes it from 
automatic statistics
+     * deletion regardless of the global retention setting.
+     */
+    public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY =
+            "statistics.auto.deletion.exclude";
+
+    private static final Lock LOCK = new ReentrantLock();
+
+    private Configuration conf;
+
+    @Override
+    public long runFrequency(TimeUnit unit) {
+        return MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit);
+    }
+
+    @Override
+    public void setConf(Configuration configuration) {
+        this.conf = new Configuration(configuration);
+        super.setConf(configuration);
+    }
+
+    @Override
+    public Configuration getConf() {
+        return conf;
+    }
+
+    @Override
+    public void run() {
+        LOG.debug("Auto statistics deletion started. Cleaning up 
table/partition column statistics over the retention period.");
+        long retentionMillis =
+                MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 
TimeUnit.MILLISECONDS);
+        if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) {
+            LOG.info("Statistics auto deletion is set to off currently.");
+            return;
+        }
+        if (!LOCK.tryLock()) {
+            return;
+        }
+        try {
+            long now = System.currentTimeMillis();
+            long lastAnalyzedThreshold = (now - retentionMillis) / 1000;
+            PersistenceManager pm = getPersistenceManager();
+            boolean committed = false;
+            openTransaction();
+            try {
+                try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) {
+                    deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold);
+                    deleteExpiredPartitionColStats(pm, msc, 
lastAnalyzedThreshold);

Review Comment:
   In `run()`, `openTransaction()/commitTransaction()` operate on 
`ObjectStore`'s internal `pm`, but the code uses a different 
`PersistenceManager pm = getPersistenceManager()` (which creates a new PM) for 
the JDO queries. This means the transaction does not apply to the queries, and 
the extra PM is never closed, leaking resources in a periodically running task. 
Use the ObjectStore-managed PM/transaction for the queries (or remove the 
ObjectStore transaction and properly close the PM you create).



##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java:
##########
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
+import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.jdo.PersistenceManager;
+import javax.jdo.Query;
+
+/**
+ * Statistics management task responsible for periodic auto-deletion of table 
and partition column
+ * statistics based on a configured retention interval.
+ *
+ * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task 
scans
+ * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code 
lastAnalyzed} timestamp
+ * is older than {@code metastore.statistics.retention.period}, and deletes 
them.
+ * Individual tables may opt out by setting the table property
+ * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null 
value.
+ */
+public class StatisticsManagementTask extends ObjectStore implements 
MetastoreTaskThread {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(StatisticsManagementTask.class);
+
+    /**
+     * Table property key that, when present on a table, excludes it from 
automatic statistics
+     * deletion regardless of the global retention setting.
+     */
+    public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY =
+            "statistics.auto.deletion.exclude";
+
+    private static final Lock LOCK = new ReentrantLock();
+
+    private Configuration conf;
+
+    @Override
+    public long runFrequency(TimeUnit unit) {
+        return MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit);
+    }
+
+    @Override
+    public void setConf(Configuration configuration) {
+        this.conf = new Configuration(configuration);
+        super.setConf(configuration);
+    }
+
+    @Override
+    public Configuration getConf() {
+        return conf;
+    }
+
+    @Override
+    public void run() {
+        LOG.debug("Auto statistics deletion started. Cleaning up 
table/partition column statistics over the retention period.");
+        long retentionMillis =
+                MetastoreConf.getTimeVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 
TimeUnit.MILLISECONDS);
+        if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, 
MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) {
+            LOG.info("Statistics auto deletion is set to off currently.");
+            return;
+        }
+        if (!LOCK.tryLock()) {
+            return;
+        }
+        try {
+            long now = System.currentTimeMillis();
+            long lastAnalyzedThreshold = (now - retentionMillis) / 1000;
+            PersistenceManager pm = getPersistenceManager();
+            boolean committed = false;
+            openTransaction();
+            try {
+                try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) {
+                    deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold);
+                    deleteExpiredPartitionColStats(pm, msc, 
lastAnalyzedThreshold);
+                }
+                committed = commitTransaction();
+            } finally {
+                if (!committed) {
+                    rollbackTransaction();
+                }
+            }
+        } catch (Exception e) {
+            LOG.error("Error during statistics auto deletion", e);
+        } finally {
+            LOCK.unlock();
+        }
+    }
+
+    /**
+     * Deletes expired table-level column statistics from {@code 
TAB_COL_STATS}.
+     * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} 
property set are skipped.
+     *
+     * @param pm                   the JDO persistence manager to use for the 
query
+     * @param msc                  the metastore client used to issue delete 
requests
+     * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed 
below this value are expired
+     * @throws Exception if the JDO query or the delete request fails
+     */
+    private void deleteExpiredTableColStats(PersistenceManager pm, 
IMetaStoreClient msc,
+                                            long lastAnalyzedThreshold) throws 
Exception {
+        Query tblQuery = null;
+        try {
+            tblQuery = pm.newQuery(MTableColumnStatistics.class);
+            tblQuery.setFilter("lastAnalyzed < threshold");
+            tblQuery.declareParameters("long threshold");
+            // partitionName does not exist on MTableColumnStatistics; omitted 
here
+            tblQuery.setResult(
+                    "table.database.name, "
+                            + "table.tableName, "
+                            + "table.parameters.get(\"" + 
STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")");
+            @SuppressWarnings("unchecked")
+            List<Object[]> tblRows = (List<Object[]>) 
tblQuery.execute(lastAnalyzedThreshold);
+            for (Object[] row : tblRows) {
+                String dbName = (String) row[0];
+                String tblName = (String) row[1];
+                String excludeVal = (String) row[2];
+                if (excludeVal != null) {
+                    LOG.info("Skipping auto deletion of table stats for {}.{} 
due to exclude property.",
+                            dbName, tblName);
+                    continue;
+                }
+                DeleteColumnStatisticsRequest request = new 
DeleteColumnStatisticsRequest(dbName, tblName);
+                request.setEngine("hive");
+                request.setTableLevel(true);
+                msc.deleteColumnStatistics(request);
+            }
+        } finally {
+            if (tblQuery != null) {
+                tblQuery.closeAll();
+            }
+        }
+    }
+
+    /**
+     * Deletes expired partition-level column statistics from {@code 
PART_COL_STATS}.
+     * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} 
property set are skipped.
+     *
+     * @param pm                   the JDO persistence manager to use for the 
query
+     * @param msc                  the metastore client used to issue delete 
requests
+     * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed 
below this value are expired
+     * @throws Exception if the JDO query or the delete request fails
+     */
+    private void deleteExpiredPartitionColStats(PersistenceManager pm, 
IMetaStoreClient msc,
+                                                long lastAnalyzedThreshold) 
throws Exception {
+        Query partQuery = null;
+        try {
+            partQuery = pm.newQuery(MPartitionColumnStatistics.class);
+            partQuery.setFilter("lastAnalyzed < threshold");
+            partQuery.declareParameters("long threshold");
+            // project via partition navigation to reach partitionName and the 
table exclude property
+            partQuery.setResult(
+                    "partition.table.database.name, "
+                            + "partition.table.tableName, "
+                            + "partition.partitionName, "
+                            + "partition.table.parameters.get(\"" + 
STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")");
+            @SuppressWarnings("unchecked")
+            List<Object[]> partRows = (List<Object[]>) 
partQuery.execute(lastAnalyzedThreshold);
+            for (Object[] row : partRows) {
+                String dbName = (String) row[0];
+                String tblName = (String) row[1];
+                String partName = (String) row[2];
+                String excludeVal = (String) row[3];
+                if (excludeVal != null) {
+                    LOG.info("Skipping auto deletion of partition stats for 
{}.{} due to exclude property.",
+                            dbName, tblName);
+                    continue;
+                }
+                DeleteColumnStatisticsRequest request = new 
DeleteColumnStatisticsRequest(dbName, tblName);
+                request.setEngine("hive");
+                request.setTableLevel(false);
+                request.addToPart_names(partName);
+                msc.deleteColumnStatistics(request);
+            }

Review Comment:
   `deleteExpiredPartitionColStats` issues partition-level deletions without 
specifying `col_names`, which deletes stats for all columns in the partition 
(including non-expired ones) whenever any single column stat row is expired. 
Since the query returns one row per column, this can also trigger repeated 
deletes and duplicate listener events for the same (db,tbl,part). Include the 
catalog name and column name in the projection and delete only the expired 
columns, grouped/deduped by (catalog, db, table, partition).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to