This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch pick/61511-branch-4.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit b0e44816ca5cab0db065eb78b5d0ef014872a5aa Author: Yongqiang YANG <[email protected]> AuthorDate: Fri Mar 20 02:41:43 2026 -0700 [improve](partition) Increase partition limit defaults to 20000 and add near-limit metrics (#61511) - Raise `max_dynamic_partition_num` default from 500 to 20000 and `max_auto_partition_num` from 2000 to 20000 to match modern production workloads - Add warning logs when partition counts exceed 80% of their configured limits, enabling proactive detection before hard failures - Add Prometheus counter metrics (`auto_partition_near_limit_count`, `dynamic_partition_near_limit_count`) for monitoring/alerting - [ ] Verify existing dynamic partition tests pass with new default (tests explicitly set config values, so unaffected) - [ ] Verify auto-partition limit check still errors correctly when exceeded - [ ] Verify warning logs appear when partition count is between 80%-100% of limit - [ ] Verify new metrics appear in `/metrics` Prometheus endpoint - [ ] Test Prometheus alert rule: `rate(doris_fe_auto_partition_near_limit_count[5m]) > 0` 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <[email protected]> Co-authored-by: dataroaring <[email protected]> --- .../src/main/java/org/apache/doris/common/Config.java | 7 +++---- .../apache/doris/common/util/DynamicPartitionUtil.java | 17 ++++++++++++++--- .../main/java/org/apache/doris/metric/MetricRepo.java | 14 ++++++++++++++ .../org/apache/doris/service/FrontendServiceImpl.java | 14 ++++++++++++-- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 41d47714a80..921796670c4 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1670,7 +1670,7 @@ public class Config extends ConfigBase { * The number is determined by "start" and "end" in the dynamic partition parameters. */ @ConfField(mutable = true, masterOnly = true) - public static int max_dynamic_partition_num = 500; + public static int max_dynamic_partition_num = 20000; /** * Used to limit the maximum number of partitions that can be created when creating multi partition, @@ -2966,9 +2966,8 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = true, description = { "对于自动分区表,防止用户意外创建大量分区,每个 OLAP 表允许的分区数量为`max_auto_partition_num`。默认 2000。", "For auto-partitioned tables to prevent users from accidentally creating a large number of partitions, " - + "the number of partitions allowed per OLAP table is `max_auto_partition_num`. Default 2000." - }) - public static int max_auto_partition_num = 2000; + + "the number of partitions allowed per OLAP table is `max_auto_partition_num`. Default 20000."}) + public static int max_auto_partition_num = 20000; @ConfField(mutable = true, masterOnly = true, description = { "Partition rebalance 方式下各个 BE 的 tablet 数最大差值,小于该值时,会诊断为已均衡", diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java index db12f6266ea..516d6942478 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java @@ -42,6 +42,7 @@ import org.apache.doris.common.ErrorReport; import org.apache.doris.common.FeConstants; import org.apache.doris.common.FeNameFormat; import org.apache.doris.common.UserException; +import org.apache.doris.metric.MetricRepo; import org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.Tag; import org.apache.doris.thrift.TStorageMedium; @@ -641,10 +642,20 @@ public class DynamicPartitionUtil { } expectCreatePartitionNum = (long) end - start; - if (!isReplay && hasEnd && (expectCreatePartitionNum > Config.max_dynamic_partition_num) + int dynamicPartitionLimit = Config.max_dynamic_partition_num; + if (!isReplay && hasEnd && Boolean.parseBoolean(analyzedProperties.getOrDefault(DynamicPartitionProperty.ENABLE, "true"))) { - throw new DdlException("Too many dynamic partitions: " - + expectCreatePartitionNum + ". Limit: " + Config.max_dynamic_partition_num); + if (expectCreatePartitionNum > dynamicPartitionLimit) { + throw new DdlException("Too many dynamic partitions: " + + expectCreatePartitionNum + ". Limit: " + dynamicPartitionLimit); + } else if (expectCreatePartitionNum > dynamicPartitionLimit * 8L / 10) { + LOG.warn("Dynamic partition count {} is approaching limit {} (>80%)." + + " Consider increasing max_dynamic_partition_num.", + expectCreatePartitionNum, dynamicPartitionLimit); + if (MetricRepo.isInit) { + MetricRepo.COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT.increase(1L); + } + } } if (properties.containsKey(DynamicPartitionProperty.START_DAY_OF_MONTH)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 6479575b64d..3ec8a97ae83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -249,6 +249,10 @@ public final class MetricRepo { public static GaugeMetricImpl<Long> GAUGE_AVG_PARTITION_SIZE_BYTES; public static GaugeMetricImpl<Long> GAUGE_AVG_TABLET_SIZE_BYTES; + // Partition near-limit warnings + public static LongCounterMetric COUNTER_AUTO_PARTITION_NEAR_LIMIT; + public static LongCounterMetric COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT; + // Agent task public static LongCounterMetric COUNTER_AGENT_TASK_REQUEST_TOTAL; public static AutoMappedMetric<LongCounterMetric> COUNTER_AGENT_TASK_TOTAL; @@ -1002,6 +1006,16 @@ public final class MetricRepo { GAUGE_AVG_TABLET_SIZE_BYTES = new GaugeMetricImpl<>("avg_tablet_size_bytes", MetricUnit.BYTES, "", 0L); DORIS_METRIC_REGISTER.addMetrics(GAUGE_AVG_TABLET_SIZE_BYTES); + // Partition near-limit warning counters + COUNTER_AUTO_PARTITION_NEAR_LIMIT = new LongCounterMetric("auto_partition_near_limit_count", + MetricUnit.NOUNIT, + "number of times auto partition count exceeded 80% of max_auto_partition_num"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_AUTO_PARTITION_NEAR_LIMIT); + COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT = new LongCounterMetric("dynamic_partition_near_limit_count", + MetricUnit.NOUNIT, + "number of times dynamic partition count exceeded 80% of max_dynamic_partition_num"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_DYNAMIC_PARTITION_NEAR_LIMIT); + COUNTER_AGENT_TASK_REQUEST_TOTAL = new LongCounterMetric("agent_task_request_total", MetricUnit.NOUNIT, "total agent batch task request send to BE"); DORIS_METRIC_REGISTER.addMetrics(COUNTER_AGENT_TASK_REQUEST_TOTAL); diff --git a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java index 0bc4942ebf8..5a025524906 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java @@ -98,6 +98,7 @@ import org.apache.doris.load.routineload.RoutineLoadJob.JobState; import org.apache.doris.load.routineload.RoutineLoadManager; import org.apache.doris.master.MasterImpl; import org.apache.doris.meta.MetaContext; +import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.privilege.AccessControllerManager; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.nereids.trees.plans.PlanNodeAndHash; @@ -3832,15 +3833,24 @@ public class FrontendServiceImpl implements FrontendService.Iface { // check partition's number limit. because partitions in addPartitionClauseMap may be duplicated with existing // partitions, which would lead to false positive. so we should check the partition number AFTER adding new // partitions using its ACTUAL NUMBER, rather than the sum of existing and requested partitions. - if (olapTable.getPartitionNum() > Config.max_auto_partition_num) { + int partitionNum = olapTable.getPartitionNum(); + int autoPartitionLimit = Config.max_auto_partition_num; + if (partitionNum > autoPartitionLimit) { String errorMessage = String.format( "partition numbers %d exceeded limit of variable max_auto_partition_num %d", - olapTable.getPartitionNum(), Config.max_auto_partition_num); + partitionNum, autoPartitionLimit); LOG.warn(errorMessage); errorStatus.setErrorMsgs(Lists.newArrayList(errorMessage)); result.setStatus(errorStatus); LOG.warn("send create partition error status: {}", result); return result; + } else if (partitionNum > autoPartitionLimit * 8 / 10) { + LOG.warn("Table {}.{} auto partition count {} is approaching limit {} (>80%)." + + " Consider increasing max_auto_partition_num.", + db.getFullName(), olapTable.getName(), partitionNum, autoPartitionLimit); + if (MetricRepo.isInit) { + MetricRepo.COUNTER_AUTO_PARTITION_NEAR_LIMIT.increase(1L); + } } // build partition & tablets --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
