This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 87d592e4bff [fix](metric) Change partition near-limit metrics from
counters to gauges (#61845)
87d592e4bff is described below
commit 87d592e4bff571ccbf57886d4856e96b1cf8f6ba
Author: Yongqiang YANG <[email protected]>
AuthorDate: Mon Mar 30 04:39:28 2026 -0700
[fix](metric) Change partition near-limit metrics from counters to gauges
(#61845)
## Summary
- Changed `auto_partition_near_limit_count` and
`dynamic_partition_near_limit_count` from `LongCounterMetric`
(monotonically increasing) to `GaugeMetricImpl<Long>` so they correctly
decrease when the near-limit condition resolves
- Moved metric computation from inline event-driven increments (in
`FrontendServiceImpl` and `DynamicPartitionUtil`) to the periodic
all-table scans in `TabletStatMgr` and `CloudTabletStatMgr`, which
already iterate all tables and partitions under read locks
- Metric names are preserved for monitoring compatibility; semantics
changed from "cumulative event count" to "current number of tables near
the limit"
## Test plan
- [ ] Verify `auto_partition_near_limit_count` increases when an
auto-partition table's (non-temp) partition count exceeds 80% of
`max_auto_partition_num`
- [ ] Verify the gauge decreases back to 0 after dropping partitions
below the 80% threshold (within one `tablet_stat_update_interval_second`
cycle)
- [ ] Verify `dynamic_partition_near_limit_count` behaves the same for
dynamic partition tables
- [ ] Verify existing Prometheus/Grafana dashboards continue to scrape
the metric names without changes
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
.../apache/doris/catalog/CloudTabletStatMgr.java | 22 ++++++++++++++++++++++
.../org/apache/doris/catalog/TabletStatMgr.java | 21 +++++++++++++++++++++
.../doris/common/util/DynamicPartitionUtil.java | 4 ----
.../java/org/apache/doris/metric/MetricRepo.java | 21 +++++++++++----------
.../apache/doris/service/FrontendServiceImpl.java | 4 ----
5 files changed, 54 insertions(+), 18 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
index 18da6784acf..7e23cfe3bbd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java
@@ -302,6 +302,8 @@ public class CloudTabletStatMgr extends MasterDaemon {
long tabletCount = 0L;
long partitionCount = 0L;
long tableCount = 0L;
+ long autoPartitionNearLimitCount = 0L;
+ long dynamicPartitionNearLimitCount = 0L;
List<OlapTable.Statistics> newCloudTableStatsList = new ArrayList<>();
for (Long dbId : dbIds) {
Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId);
@@ -333,7 +335,24 @@ public class CloudTabletStatMgr extends MasterDaemon {
OlapTable.Statistics tableStats;
try {
List<Partition> allPartitions =
olapTable.getAllPartitions();
+ // Use getPartitionNum() (excludes temp partitions) for
limit check,
+ // consistent with how partition limits are enforced
elsewhere.
+ int nonTempPartitionNum = olapTable.getPartitionNum();
partitionCount += allPartitions.size();
+ // Check if this table's partition count is near the limit
(>80%)
+ if
(olapTable.getPartitionInfo().enableAutomaticPartition()) {
+ int limit = Config.max_auto_partition_num;
+ if (nonTempPartitionNum > limit * 8L / 10) {
+ autoPartitionNearLimitCount++;
+ }
+ }
+ if (olapTable.dynamicPartitionExists()
+ &&
olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()) {
+ int limit = Config.max_dynamic_partition_num;
+ if (nonTempPartitionNum > limit * 8L / 10) {
+ dynamicPartitionNearLimitCount++;
+ }
+ }
for (Partition partition : allPartitions) {
long partitionDataSize = 0L;
for (MaterializedIndex index :
partition.getMaterializedIndices(IndexExtState.VISIBLE)) {
@@ -449,6 +468,9 @@ public class CloudTabletStatMgr extends MasterDaemon {
long avgTabletSize = totalTableSize / Math.max(1, tabletCount);
MetricRepo.GAUGE_AVG_TABLET_SIZE_BYTES.setValue(avgTabletSize);
+
MetricRepo.GAUGE_AUTO_PARTITION_NEAR_LIMIT.setValue(autoPartitionNearLimitCount);
+
MetricRepo.GAUGE_DYNAMIC_PARTITION_NEAR_LIMIT.setValue(dynamicPartitionNearLimitCount);
+
LOG.info("OlapTable num=" + tableCount
+ ", partition num=" + partitionCount + ", tablet num=" +
tabletCount
+ ", max tablet byte size=" + maxTabletSize.second
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java
index 37b198652be..a493aecc4ac 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java
@@ -128,6 +128,8 @@ public class TabletStatMgr extends MasterDaemon {
long tabletCount = 0L;
long partitionCount = 0L;
long tableCount = 0L;
+ long autoPartitionNearLimitCount = 0L;
+ long dynamicPartitionNearLimitCount = 0L;
List<Long> dbIds = Env.getCurrentInternalCatalog().getDbIds();
for (Long dbId : dbIds) {
Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId);
@@ -162,7 +164,24 @@ public class TabletStatMgr extends MasterDaemon {
}
try {
List<Partition> allPartitions =
olapTable.getAllPartitions();
+ // Use getPartitionNum() (excludes temp partitions) for
limit check,
+ // consistent with how partition limits are enforced
elsewhere.
+ int nonTempPartitionNum = olapTable.getPartitionNum();
partitionCount += allPartitions.size();
+ // Check if this table's partition count is near the limit
(>80%)
+ if
(olapTable.getPartitionInfo().enableAutomaticPartition()) {
+ int limit = Config.max_auto_partition_num;
+ if (nonTempPartitionNum > limit * 8L / 10) {
+ autoPartitionNearLimitCount++;
+ }
+ }
+ if (olapTable.dynamicPartitionExists()
+ &&
olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()) {
+ int limit = Config.max_dynamic_partition_num;
+ if (nonTempPartitionNum > limit * 8L / 10) {
+ dynamicPartitionNearLimitCount++;
+ }
+ }
for (Partition partition : allPartitions) {
long partitionDataSize = 0L;
long version = partition.getVisibleVersion();
@@ -295,6 +314,8 @@ public class TabletStatMgr extends MasterDaemon {
// avoid ArithmeticException: / by zero
long avgTabletSize = totalTableSize / Math.max(1, tabletCount);
MetricRepo.GAUGE_AVG_TABLET_SIZE_BYTES.setValue(avgTabletSize);
+
MetricRepo.GAUGE_AUTO_PARTITION_NEAR_LIMIT.setValue(autoPartitionNearLimitCount);
+
MetricRepo.GAUGE_DYNAMIC_PARTITION_NEAR_LIMIT.setValue(dynamicPartitionNearLimitCount);
LOG.info("OlapTable num=" + tableCount
+ ", partition num=" + partitionCount + ", tablet num=" +
tabletCount
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java
index 516d6942478..09b4a9f18e8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java
@@ -42,7 +42,6 @@ import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.FeNameFormat;
import org.apache.doris.common.UserException;
-import org.apache.doris.metric.MetricRepo;
import org.apache.doris.policy.StoragePolicy;
import org.apache.doris.resource.Tag;
import org.apache.doris.thrift.TStorageMedium;
@@ -652,9 +651,6 @@ public class DynamicPartitionUtil {
LOG.warn("Dynamic partition count {} is approaching limit {}
(>80%)."
+ " Consider increasing max_dynamic_partition_num.",
expectCreatePartitionNum, dynamicPartitionLimit);
- if (MetricRepo.isInit) {
-
MetricRepo.COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT.increase(1L);
- }
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
index c28c2aeb99f..d05c9f45ff1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -256,9 +256,9 @@ public final class MetricRepo {
public static GaugeMetricImpl<Long> GAUGE_AVG_PARTITION_SIZE_BYTES;
public static GaugeMetricImpl<Long> GAUGE_AVG_TABLET_SIZE_BYTES;
- // Partition near-limit warnings
- public static LongCounterMetric COUNTER_AUTO_PARTITION_NEAR_LIMIT;
- public static LongCounterMetric COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT;
+ // Partition near-limit warnings (gauges: current number of tables near
the partition limit)
+ public static GaugeMetricImpl<Long> GAUGE_AUTO_PARTITION_NEAR_LIMIT;
+ public static GaugeMetricImpl<Long> GAUGE_DYNAMIC_PARTITION_NEAR_LIMIT;
// Agent task
public static LongCounterMetric COUNTER_AGENT_TASK_REQUEST_TOTAL;
@@ -1044,15 +1044,16 @@ public final class MetricRepo {
GAUGE_AVG_TABLET_SIZE_BYTES = new
GaugeMetricImpl<>("avg_tablet_size_bytes", MetricUnit.BYTES, "", 0L);
DORIS_METRIC_REGISTER.addMetrics(GAUGE_AVG_TABLET_SIZE_BYTES);
- // Partition near-limit warning counters
- COUNTER_AUTO_PARTITION_NEAR_LIMIT = new
LongCounterMetric("auto_partition_near_limit_count",
+ // Partition near-limit warning gauges (updated by TabletStatMgr
periodic scan)
+ GAUGE_AUTO_PARTITION_NEAR_LIMIT = new
GaugeMetricImpl<>("auto_partition_near_limit_count",
MetricUnit.NOUNIT,
- "number of times auto partition count exceeded 80% of
max_auto_partition_num");
- DORIS_METRIC_REGISTER.addMetrics(COUNTER_AUTO_PARTITION_NEAR_LIMIT);
- COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT = new
LongCounterMetric("dynamic_partition_near_limit_count",
+ "number of auto partition tables where partition count
exceeded 80% of max_auto_partition_num", 0L);
+ DORIS_METRIC_REGISTER.addMetrics(GAUGE_AUTO_PARTITION_NEAR_LIMIT);
+ GAUGE_DYNAMIC_PARTITION_NEAR_LIMIT = new
GaugeMetricImpl<>("dynamic_partition_near_limit_count",
MetricUnit.NOUNIT,
- "number of times dynamic partition count exceeded 80% of
max_dynamic_partition_num");
- DORIS_METRIC_REGISTER.addMetrics(COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT);
+ "number of dynamic partition tables where partition count
exceeded 80% of max_dynamic_partition_num",
+ 0L);
+ DORIS_METRIC_REGISTER.addMetrics(GAUGE_DYNAMIC_PARTITION_NEAR_LIMIT);
COUNTER_AGENT_TASK_REQUEST_TOTAL = new
LongCounterMetric("agent_task_request_total", MetricUnit.NOUNIT,
"total agent batch task request send to BE");
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
index 44410af0163..6d5cbaab063 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java
@@ -96,7 +96,6 @@ import
org.apache.doris.load.routineload.RoutineLoadJob.JobState;
import org.apache.doris.load.routineload.RoutineLoadManager;
import org.apache.doris.master.MasterImpl;
import org.apache.doris.meta.MetaContext;
-import org.apache.doris.metric.MetricRepo;
import org.apache.doris.mysql.privilege.AccessControllerManager;
import org.apache.doris.mysql.privilege.PrivPredicate;
import org.apache.doris.nereids.trees.plans.PlanNodeAndHash;
@@ -4407,9 +4406,6 @@ public class FrontendServiceImpl implements
FrontendService.Iface {
LOG.warn("Table {}.{} auto partition count {} is approaching limit
{} (>80%)."
+ " Consider increasing max_auto_partition_num.",
db.getFullName(), olapTable.getName(), partitionNum,
autoPartitionLimit);
- if (MetricRepo.isInit) {
- MetricRepo.COUNTER_AUTO_PARTITION_NEAR_LIMIT.increase(1L);
- }
}
// build partition & tablets
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]