This is an automated email from the ASF dual-hosted git repository.
shreemaan-abhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new d0929e92d feat(prometheus): support disabling labels via plugin
metadata to reduce cardinality (#13202)
d0929e92d is described below
commit d0929e92d6fa3fe344497037d231d469842446cc
Author: Mohammad Izzraff Janius
<[email protected]>
AuthorDate: Mon Jun 15 14:47:05 2026 +0900
feat(prometheus): support disabling labels via plugin metadata to reduce
cardinality (#13202)
---
apisix/plugins/prometheus.lua | 53 +++++++-
apisix/plugins/prometheus/exporter.lua | 225 +++++++++++++++++++++++----------
docs/en/latest/plugins/prometheus.md | 72 +++++++++++
docs/zh/latest/plugins/prometheus.md | 72 +++++++++++
t/plugin/prometheus-label-filter.t | 172 +++++++++++++++++++++++++
5 files changed, 529 insertions(+), 65 deletions(-)
diff --git a/apisix/plugins/prometheus.lua b/apisix/plugins/prometheus.lua
index 592c12ab5..b1f3c58b3 100644
--- a/apisix/plugins/prometheus.lua
+++ b/apisix/plugins/prometheus.lua
@@ -16,6 +16,8 @@
--
local core = require("apisix.core")
local exporter = require("apisix.plugins.prometheus.exporter")
+local pairs = pairs
+local ipairs = ipairs
local plugin_name = "prometheus"
local schema = {
@@ -29,6 +31,50 @@ local schema = {
}
+-- Labels that define a metric's identity cannot be disabled: e.g. collapsing
+-- `type` would merge request/upstream/apisix latencies into one histogram series.
+local structural_labels = {
+ http_status = {code = true},
+ http_latency = {type = true},
+ bandwidth = {type = true},
+ llm_latency = {type = true},
+}
+
+
+local function build_disabled_labels_properties()
+ local properties = {}
+ for metric_name, metric_labels in pairs(exporter.metric_label_map) do
+ local enum = {}
+ local structural = structural_labels[metric_name]
+ for _, label in ipairs(metric_labels) do
+ if not (structural and structural[label]) then
+ core.table.insert(enum, label)
+ end
+ end
+ properties[metric_name] = {
+ type = "array",
+ items = {
+ type = "string",
+ enum = enum,
+ },
+ }
+ end
+ return properties
+end
+
+
+local metadata_schema = {
+ type = "object",
+ properties = {
+ disabled_labels = {
+ type = "object",
+ properties = build_disabled_labels_properties(),
+ additionalProperties = false,
+ },
+ },
+}
+
+
local _M = {
version = 0.2,
priority = 500,
@@ -36,11 +82,16 @@ local _M = {
log = exporter.http_log,
destroy = exporter.destroy,
schema = schema,
+ metadata_schema = metadata_schema,
run_policy = "prefer_route",
}
-function _M.check_schema(conf)
+function _M.check_schema(conf, schema_type)
+ if schema_type == core.schema.TYPE_METADATA then
+ return core.schema.check(metadata_schema, conf)
+ end
+
local ok, err = core.schema.check(schema, conf)
if not ok then
return false, err
diff --git a/apisix/plugins/prometheus/exporter.lua
b/apisix/plugins/prometheus/exporter.lua
index cc4673839..90c5d810d 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -77,7 +77,6 @@ local exporter_timer_running = false
local exporter_timer_created = false
-
local function gen_arr(...)
clear_tab(inner_tab_arr)
for i = 1, select('#', ...) do
@@ -113,7 +112,99 @@ local function extra_labels(name, ctx)
end
-local _M = {}
+local lrucache = core.lrucache.new({
+ type = "plugin",
+})
+
+
+local metric_label_map = {
+ http_status = {"code", "route", "matched_uri", "matched_host", "service",
"consumer", "node",
+ "request_type", "request_llm_model", "llm_model", "response_source"},
+ http_latency = {"type", "route", "service", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ bandwidth = {"type", "route", "service", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_latency = {"type", "route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_prompt_tokens = {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_completion_tokens = {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_active_connections = {"route", "route_id", "matched_uri",
"matched_host",
+ "service", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_prompt_tokens_dist = {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+ llm_completion_tokens_dist = {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model"},
+}
+
+
+local function append_tables(...)
+ local res = {}
+ for _, tab in ipairs({...}) do
+ for _, v in ipairs(tab) do
+ core.table.insert(res, v)
+ end
+ end
+ return res
+end
+
+
+-- shared and read-only: avoids allocating in the log hot path
+local empty_disabled_map = {}
+
+
+local function build_disabled_label_metric_map(disabled_labels)
+ local disabled_label_metric_map = {}
+ for metric_name, labels in pairs(disabled_labels) do
+ disabled_label_metric_map[metric_name] = {}
+ for _, label in ipairs(labels) do
+ disabled_label_metric_map[metric_name][label] = true
+ end
+ end
+ return disabled_label_metric_map
+end
+
+
+-- Returns metric_name -> {label = true}, rebuilt only when the metadata changes.
+local function get_disabled_label_metric_map()
+ local metadata = plugin.plugin_metadata(plugin_name)
+ if not (metadata and metadata.value and metadata.value.disabled_labels
+ and metadata.modifiedIndex) then
+ return empty_disabled_map
+ end
+
+ return lrucache(plugin_name, metadata.modifiedIndex,
+ build_disabled_label_metric_map,
metadata.value.disabled_labels)
+end
+
+
+local function get_enabled_label_values_for_metric(metric_name,
disabled_label_metric_map, ...)
+ local label_values = gen_arr(...)
+
+ -- fast path: nothing disabled for this metric
+ local disabled_labels = disabled_label_metric_map[metric_name]
+ if not disabled_labels then
+ return label_values
+ end
+
+ -- iterate the ordered label list rather than `label_values`: a nil value
+ -- must not end the scan early, and extra_labels after the built-ins stay untouched
+ local metric_labels = metric_label_map[metric_name]
+ for i = 1, #metric_labels do
+ if disabled_labels[metric_labels[i]] then
+ label_values[i] = ""
+ end
+ end
+
+ return label_values
+end
+
+
+local _M = {
+ metric_label_map = metric_label_map,
+}
local function init_stream_metrics()
@@ -211,10 +302,7 @@ function _M.http_init(prometheus_enabled_in_stream)
-- no consumer in request.
metrics.status = prometheus:counter("http_status",
"HTTP status codes per service in APISIX",
- {"code", "route", "matched_uri", "matched_host", "service",
"consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- "response_source",
- unpack(extra_labels("http_status"))},
+ append_tables(metric_label_map.http_status,
extra_labels("http_status")),
status_metrics_exptime)
local buckets = DEFAULT_BUCKETS
@@ -223,54 +311,45 @@ function _M.http_init(prometheus_enabled_in_stream)
end
metrics.latency = prometheus:histogram("http_latency",
- "HTTP request latency in milliseconds per service in APISIX",
- {"type", "route", "service", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("http_latency"))},
- buckets, latency_metrics_exptime)
+ "HTTP request latency in milliseconds per service in APISIX",
+ append_tables(metric_label_map.http_latency,
extra_labels("http_latency")),
+ buckets, latency_metrics_exptime)
metrics.bandwidth = prometheus:counter("bandwidth",
"Total bandwidth in bytes consumed per service in APISIX",
- {"type", "route", "service", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("bandwidth"))},
+ append_tables(metric_label_map.bandwidth,
extra_labels("bandwidth")),
bandwidth_metrics_exptime)
local llm_latency_buckets = DEFAULT_BUCKETS
if attr and attr.llm_latency_buckets then
llm_latency_buckets = attr.llm_latency_buckets
end
+
-- The "type" label distinguishes latency kinds, mirroring apisix_http_latency:
-- total - full response latency (both ai_chat and ai_stream)
-- ttft - time to first token (ai_stream only)
metrics.llm_latency = prometheus:histogram("llm_latency",
- "LLM request latency in milliseconds",
- {"type", "route_id", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_latency"))},
- llm_latency_buckets,
- llm_latency_exptime)
+ "LLM request latency in milliseconds",
+ append_tables(metric_label_map.llm_latency,
extra_labels("llm_latency")),
+ llm_latency_buckets,
+ llm_latency_exptime)
metrics.llm_prompt_tokens = prometheus:counter("llm_prompt_tokens",
"LLM service consumed prompt tokens",
- {"route_id", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_prompt_tokens"))},
+ append_tables(metric_label_map.llm_prompt_tokens,
+ extra_labels("llm_prompt_tokens")),
llm_prompt_tokens_exptime)
metrics.llm_completion_tokens = prometheus:counter("llm_completion_tokens",
"LLM service consumed completion tokens",
- {"route_id", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_completion_tokens"))},
+ append_tables(metric_label_map.llm_completion_tokens,
+ extra_labels("llm_completion_tokens")),
llm_completion_tokens_exptime)
metrics.llm_active_connections = prometheus:gauge("llm_active_connections",
"Number of active connections to LLM service",
- {"route", "route_id", "matched_uri", "matched_host",
- "service", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_active_connections"))},
+ append_tables(metric_label_map.llm_active_connections,
+ extra_labels("llm_active_connections")),
llm_active_connections_exptime)
local llm_prompt_tokens_buckets = DEFAULT_TOKEN_BUCKETS
@@ -279,9 +358,8 @@ function _M.http_init(prometheus_enabled_in_stream)
end
metrics.llm_prompt_tokens_dist =
prometheus:histogram("llm_prompt_tokens_dist",
"LLM prompt tokens distribution per request",
- {"route_id", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_prompt_tokens_dist"))},
+ append_tables(metric_label_map.llm_prompt_tokens_dist,
+ extra_labels("llm_prompt_tokens_dist")),
llm_prompt_tokens_buckets,
llm_prompt_tokens_dist_exptime)
@@ -291,9 +369,8 @@ function _M.http_init(prometheus_enabled_in_stream)
end
metrics.llm_completion_tokens_dist =
prometheus:histogram("llm_completion_tokens_dist",
"LLM completion tokens distribution per request",
- {"route_id", "service_id", "consumer", "node",
- "request_type", "request_llm_model", "llm_model",
- unpack(extra_labels("llm_completion_tokens_dist"))},
+ append_tables(metric_label_map.llm_completion_tokens_dist,
+ extra_labels("llm_completion_tokens_dist")),
llm_completion_tokens_buckets,
llm_completion_tokens_dist_exptime)
@@ -329,6 +406,7 @@ end
function _M.http_log(conf, ctx)
local vars = ctx.var
+ local disabled_label_metric_map = get_disabled_label_metric_map()
local route_id = ""
local balancer_ip = ctx.balancer_ip or ""
@@ -358,43 +436,50 @@ function _M.http_log(conf, ctx)
local response_source = core.response.get_response_source(ctx)
metrics.status:inc(1,
- gen_arr(vars.status, route_id, matched_uri, matched_host,
- service_id, consumer_name, balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- response_source,
- unpack(extra_labels("http_status", ctx))))
+ get_enabled_label_values_for_metric("http_status",
disabled_label_metric_map,
+ vars.status, route_id, matched_uri, matched_host,
+ service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ response_source,
+ unpack(extra_labels("http_status", ctx))))
local latency, upstream_latency, apisix_latency = latency_details(ctx)
+
local latency_extra_label_values = extra_labels("http_latency", ctx)
metrics.latency:observe(latency,
- gen_arr("request", route_id, service_id, consumer_name, balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(latency_extra_label_values)))
+ get_enabled_label_values_for_metric("http_latency",
disabled_label_metric_map,
+ "request", route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(latency_extra_label_values)))
if upstream_latency then
metrics.latency:observe(upstream_latency,
- gen_arr("upstream", route_id, service_id, consumer_name,
balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(latency_extra_label_values)))
+ get_enabled_label_values_for_metric("http_latency",
disabled_label_metric_map,
+ "upstream", route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(latency_extra_label_values)))
end
metrics.latency:observe(apisix_latency,
- gen_arr("apisix", route_id, service_id, consumer_name, balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(latency_extra_label_values)))
+ get_enabled_label_values_for_metric("http_latency",
disabled_label_metric_map,
+ "apisix", route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(latency_extra_label_values)))
local bandwidth_extra_label_values = extra_labels("bandwidth", ctx)
metrics.bandwidth:inc(vars.request_length,
- gen_arr("ingress", route_id, service_id, consumer_name, balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(bandwidth_extra_label_values)))
+ get_enabled_label_values_for_metric("bandwidth",
disabled_label_metric_map,
+ "ingress", route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(bandwidth_extra_label_values)))
metrics.bandwidth:inc(vars.bytes_sent,
- gen_arr("egress", route_id, service_id, consumer_name, balancer_ip,
- vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(bandwidth_extra_label_values)))
+ get_enabled_label_values_for_metric("bandwidth",
disabled_label_metric_map,
+ "egress", route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(bandwidth_extra_label_values)))
if vars.request_type == "ai_stream" or vars.request_type == "ai_chat" then
local llm_time_to_first_token = vars.llm_time_to_first_token
@@ -404,35 +489,44 @@ function _M.http_log(conf, ctx)
-- TTFT, so use apisix_upstream_response_time (refreshed on every
-- chunk) to capture the time until the whole response completes.
metrics.llm_latency:observe(tonumber(vars.apisix_upstream_response_time),
- gen_arr("total", route_id, service_id, consumer_name,
balancer_ip,
+ get_enabled_label_values_for_metric("llm_latency",
disabled_label_metric_map,
+ "total", route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_latency", ctx))))
-- type="ttft": time to first token, only streaming exposes a real one.
if vars.request_type == "ai_stream" then
metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
- gen_arr("ttft", route_id, service_id, consumer_name,
balancer_ip,
+ get_enabled_label_values_for_metric("llm_latency",
disabled_label_metric_map,
+ "ttft", route_id, service_id, consumer_name,
balancer_ip,
vars.request_type, vars.request_llm_model,
vars.llm_model,
unpack(extra_labels("llm_latency", ctx))))
end
end
+
metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
- gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ get_enabled_label_values_for_metric("llm_prompt_tokens",
disabled_label_metric_map,
+ route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_prompt_tokens", ctx))))
metrics.llm_prompt_tokens_dist:observe(tonumber(vars.llm_prompt_tokens),
- gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ get_enabled_label_values_for_metric("llm_prompt_tokens_dist",
+ disabled_label_metric_map,
+ route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_prompt_tokens_dist", ctx))))
metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
- gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ get_enabled_label_values_for_metric("llm_completion_tokens",
disabled_label_metric_map,
+ route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_completion_tokens", ctx))))
metrics.llm_completion_tokens_dist:observe(tonumber(vars.llm_completion_tokens),
- gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ get_enabled_label_values_for_metric("llm_completion_tokens_dist",
+ disabled_label_metric_map,
+ route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_completion_tokens_dist", ctx))))
end
@@ -830,12 +924,15 @@ local function inc_llm_active_connections(ctx, value)
matched_host = ctx.curr_req_matched._host or ""
end
+ local disabled_label_metric_map = get_disabled_label_metric_map()
+
metrics.llm_active_connections:inc(
value,
- gen_arr(route_name, route_id, matched_uri,
+ get_enabled_label_values_for_metric("llm_active_connections",
disabled_label_metric_map,
+ route_name, route_id, matched_uri,
matched_host, service_name, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
- unpack(extra_labels("llm_active_connections", ctx)))
+ unpack(extra_labels("llm_active_connections", ctx)))
)
end
diff --git a/docs/en/latest/plugins/prometheus.md
b/docs/en/latest/plugins/prometheus.md
index 2f92041fe..194d32042 100644
--- a/docs/en/latest/plugins/prometheus.md
+++ b/docs/en/latest/plugins/prometheus.md
@@ -94,6 +94,18 @@ Reload APISIX for changes to take effect.
| ------------- | ------- | -------- | ------- | ------------ | ------------------------------------------ |
| prefer_name | boolean | False | false | | If true, export Route/Service name instead of their ID in Prometheus metrics. |
+## Metadata
+
+You can configure the Plugin through its [Plugin Metadata](../terminology/plugin-metadata.md), which is set dynamically through the Admin API and takes effect at runtime without a restart.
+
+| Name | Type | Required | Description
|
+| --------------- | ------ | -------- |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
+| disabled_labels | object | False | Per-metric map of built-in label names whose values are collapsed to an empty string `""` to reduce metric cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, `llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, `llm_active_connections`, `llm_prompt_tokens_dist`, `llm_completion_tokens_dist`. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |
+
+Collapsing a label's value to `""` keeps the label registered in the metric schema, so existing dashboards, `absent()` alerts, and recording rules keep working — only the high-cardinality time series that differ solely by those labels are collapsed into one. This is useful in dynamic environments such as Kubernetes autoscaling, where the upstream node IP (`node` label) churns rapidly and would otherwise overflow the `prometheus-metrics` shared dict.
+
+See [Reduce Metric Cardinality by Disabling Labels](#reduce-metric-cardinality-by-disabling-labels) for an example.
+
## Metrics
There are different types of metrics in Prometheus. To understand their differences, see [metrics types](https://prometheus.io/docs/concepts/metric_types/).
@@ -509,6 +521,66 @@ You should see an output similar to the following:
apisix_http_status{code="200",route="1",matched_uri="/get",matched_host="",service="",consumer="",node="54.237.103.220",upstream_addr="54.237.103.220:80",route_name="extra-label"} 1
```
+### Reduce Metric Cardinality by Disabling Labels
+
+The following example demonstrates how to reduce metric cardinality by collapsing the values of selected built-in labels to an empty string `""` using the [Plugin Metadata](../terminology/plugin-metadata.md). This is useful in dynamic environments such as Kubernetes autoscaling, where the upstream node IP (`node` label) churns rapidly and would otherwise overflow the `prometheus-metrics` shared dict.
+
+Collapsing a label's value keeps the label registered in the metric schema, so existing dashboards, `absent()` alerts, and recording rules keep working. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled.
+
+Create a Route with the `prometheus` Plugin:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "id": "prometheus-route",
+ "uri": "/get",
+ "plugins": {
+ "prometheus": {}
+ },
+ "upstream": {
+ "nodes": {
+ "httpbin.org:80": 1
+ }
+ }
+ }'
+```
+
+Configure the Plugin Metadata to collapse the `node` and `consumer` labels on `apisix_http_status` and the `node` label on `apisix_http_latency`:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/plugin_metadata/prometheus" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "disabled_labels": {
+ "http_status": ["node", "consumer"],
+ "http_latency": ["node"]
+ }
+ }'
+```
+
+Send a request to the Route to verify:
+
+```shell
+curl -i "http://127.0.0.1:9080/get"
+```
+
+You should see an `HTTP/1.1 200 OK` response.
+
+Send a request to the APISIX Prometheus metrics endpoint:
+
+```shell
+curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
+```
+
+You should see that `node` and `consumer` are collapsed to empty strings on `apisix_http_status`, while metrics that are not listed (such as `apisix_bandwidth`) keep all their label values:
+
+```text
+# HELP apisix_http_status HTTP status codes per service in APISIX
+# TYPE apisix_http_status counter
+apisix_http_status{code="200",route="prometheus-route",matched_uri="/get",matched_host="",service="",consumer="",node="",request_type="traditional_http",request_llm_model="",llm_model="",response_source="upstream"} 1
+```
+
### Monitor TCP/UDP Traffic with Prometheus
The following example demonstrates how to collect TCP/UDP traffic metrics in APISIX.
diff --git a/docs/zh/latest/plugins/prometheus.md
b/docs/zh/latest/plugins/prometheus.md
index ff3963a6e..d7eed001a 100644
--- a/docs/zh/latest/plugins/prometheus.md
+++ b/docs/zh/latest/plugins/prometheus.md
@@ -94,6 +94,18 @@ plugin_attr:
| ------------ | --------| ------ | ------ |
----------------------------------------------------- |
|prefer_name | boolean | 否 | false | 当设置为 `true` 时,则在`prometheus`
指标中导出路由/服务名称而非它们的 `id`。 |
+## 元数据
+
+你可以通过插件的[元数据(Plugin Metadata)](../terminology/plugin-metadata.md)进行配置。元数据通过
Admin API 动态设置,无需重启即可在运行时生效。
+
+| 名称 | 类型 | 必选项 | 描述
|
+| --------------- | ------ | ------ |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
+| disabled_labels | object | 否 | 按指标配置的内置标签列表,列出的标签其值会被设置为空字符串 `""`
以降低指标基数。以指标名称作为键:`http_status`、`http_latency`、`bandwidth`、`llm_latency`、`llm_prompt_tokens`、`llm_completion_tokens`、`llm_active_connections`、`llm_prompt_tokens_dist`、`llm_completion_tokens_dist`。定义指标本身含义的结构性标签(`http_status`
的 `code`、`http_latency`、`bandwidth` 与 `llm_latency` 的 `type`)不可被禁用。 |
+
+将标签值设置为 `""` 时,标签仍保留在指标 schema 中,因此现有的仪表盘、`absent()` 告警和 recording rule
都不受影响——只是将仅因这些标签而不同的高基数时间序列合并为一条。这在 Kubernetes 弹性伸缩等动态环境中尤其有用:此时上游节点 IP(`node`
标签)频繁变化,否则会很快撑爆 `prometheus-metrics` 共享字典。
+
+示例请参见[通过禁用标签降低指标基数](#通过禁用标签降低指标基数)。
+
## 指标
Prometheus
中有不同类型的指标。要了解它们之间的区别,请参见[指标类型](https://prometheus.io/docs/concepts/metric_types/)。
@@ -508,6 +520,66 @@ curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
apisix_http_status{code="200",route="1",matched_uri="/get",matched_host="",service="",consumer="",node="54.237.103.220",upstream_addr="54.237.103.220:80",route_name="extra-label"} 1
```
+### 通过禁用标签降低指标基数
+
+以下示例演示如何通过[插件元数据(Plugin
Metadata)](../terminology/plugin-metadata.md)将选定内置标签的值折叠为空字符串 `""`,从而降低指标基数。这在
Kubernetes 弹性伸缩等动态环境中尤其有用:此时上游节点 IP(`node` 标签)频繁变化,否则会很快撑爆 `prometheus-metrics`
共享字典。
+
+将标签值折叠后,标签仍保留在指标 schema 中,因此现有的仪表盘、`absent()` 告警和 recording rule
都不受影响。定义指标本身含义的结构性标签(`http_status` 的 `code`、`http_latency`、`bandwidth` 与
`llm_latency` 的 `type`)不可被禁用。
+
+创建一个启用 `prometheus` 插件的路由:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "id": "prometheus-route",
+ "uri": "/get",
+ "plugins": {
+ "prometheus": {}
+ },
+ "upstream": {
+ "nodes": {
+ "httpbin.org:80": 1
+ }
+ }
+ }'
+```
+
+配置插件元数据,将 `apisix_http_status` 的 `node` 和 `consumer` 标签、以及
`apisix_http_latency` 的 `node` 标签的值折叠:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/plugin_metadata/prometheus" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "disabled_labels": {
+ "http_status": ["node", "consumer"],
+ "http_latency": ["node"]
+ }
+ }'
+```
+
+向路由发送请求以进行验证:
+
+```shell
+curl -i "http://127.0.0.1:9080/get"
+```
+
+你应该看到 `HTTP/1.1 200 OK` 的响应。
+
+向 APISIX Prometheus 指标端点发送请求:
+
+```shell
+curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
+```
+
+你应该看到 `apisix_http_status` 中的 `node` 和 `consumer` 被折叠为空字符串,而未列出的指标(如
`apisix_bandwidth`)仍保留其所有标签值:
+
+```text
+# HELP apisix_http_status APISIX 中每个服务的 HTTP 状态代码
+# TYPE apisix_http_status counter
+apisix_http_status{code="200",route="prometheus-route",matched_uri="/get",matched_host="",service="",consumer="",node="",request_type="traditional_http",request_llm_model="",llm_model="",response_source="upstream"} 1
+```
+
### 使用 Prometheus 监控 TCP/UDP 流量
以下示例演示如何在 APISIX 中收集 TCP/UDP 流量指标。
diff --git a/t/plugin/prometheus-label-filter.t
b/t/plugin/prometheus-label-filter.t
new file mode 100644
index 000000000..fe1324c1c
--- /dev/null
+++ b/t/plugin/prometheus-label-filter.t
@@ -0,0 +1,172 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+BEGIN {
+ if ($ENV{TEST_NGINX_CHECK_LEAK}) {
+ $SkipReason = "unavailable for the hup tests";
+
+ } else {
+ $ENV{TEST_NGINX_USE_HUP} = 1;
+ undef $ENV{TEST_NGINX_USE_STAP};
+ }
+}
+
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ if (!defined $block->yaml_config) {
+ $block->set_value("yaml_config", <<'EOF');
+plugin_attr:
+ prometheus:
+ refresh_interval: 0.1
+EOF
+ }
+});
+
+run_tests;
+
+__DATA__
+
+=== TEST 1: set up routes and disable labels per-metric via plugin metadata
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+
+ local code = t('/apisix/admin/routes/metrics',
+ ngx.HTTP_PUT,
+ [[{
+ "plugins": {"public-api": {}},
+ "uri": "/apisix/prometheus/metrics"
+ }]])
+ if code >= 300 then
+ ngx.status = code
+ ngx.say("failed to create metrics route")
+ return
+ end
+
+ code = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "plugins": {"prometheus": {}},
+ "upstream": {
+ "nodes": {"127.0.0.1:1980": 1},
+ "type": "roundrobin"
+ },
+ "uri": "/hello"
+ }]])
+ if code >= 300 then
+ ngx.status = code
+ ngx.say("failed to create route 1")
+ return
+ end
+
+ local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+ ngx.HTTP_PUT,
+ [[{
+ "disabled_labels": {
+ "http_status": ["route", "node"],
+ "bandwidth": ["node"]
+ }
+ }]])
+ if code >= 300 then
+ ngx.status = code
+ ngx.say(body)
+ return
+ end
+
+ -- give the data plane time to sync the routes and plugin metadata
+ ngx.sleep(1.5)
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: warm up metrics with client requests
+--- pipelined_requests eval
+["GET /hello", "GET /hello", "GET /hello"]
+--- error_code eval
+[200, 200, 200]
+
+
+
+=== TEST 3: http_status has disabled labels (route, node) collapsed to ""
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_http_status\{code="\d+",route="",matched_uri="[^"]*",matched_host="[^"]*",service="[^"]*",consumer="[^"]*",node="",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*",response_source="[^"]*"\}
\d+/
+
+
+
+=== TEST 4: per-metric scoping - http_latency keeps route and node populated
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_http_latency_count\{type="request",route="1",service="[^"]*",consumer="[^"]*",node="127.0.0.1",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*"\}
\d+/
+
+
+
+=== TEST 5: per-metric scoping - bandwidth collapses node but keeps route
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_bandwidth\{type="(?:ingress|egress)",route="1",service="[^"]*",consumer="[^"]*",node="",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*"\}
\d+/
+
+
+
+=== TEST 6: reject disabling a structural label (`code` on http_status)
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+ ngx.HTTP_PUT,
+ [[{"disabled_labels": {"http_status": ["code"]}}]])
+ ngx.say(body)
+ }
+ }
+--- response_body eval
+qr/failed to validate item 1/
+
+
+
+=== TEST 7: reject an unknown metric key (additionalProperties = false)
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+ ngx.HTTP_PUT,
+ [[{"disabled_labels": {"unknown_metric": ["node"]}}]])
+ ngx.say(body)
+ }
+ }
+--- response_body eval
+qr/additional properties forbidden/