(apisix) branch master updated: feat(prometheus): support disabling labels via plugin metadata to reduce cardinality (#13202)

shreemaanabhishek Sun, 14 Jun 2026 22:47:20 -0700

This is an automated email from the ASF dual-hosted git repository.

shreemaan-abhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git



The following commit(s) were added to refs/heads/master by this push:
     new d0929e92d feat(prometheus): support disabling labels via plugin 
metadata to reduce cardinality (#13202)
d0929e92d is described below

commit d0929e92d6fa3fe344497037d231d469842446cc
Author: Mohammad Izzraff Janius 
<[email protected]>
AuthorDate: Mon Jun 15 14:47:05 2026 +0900

    feat(prometheus): support disabling labels via plugin metadata to reduce 
cardinality (#13202)
---
 apisix/plugins/prometheus.lua          |  53 +++++++-
 apisix/plugins/prometheus/exporter.lua | 225 +++++++++++++++++++++++----------
 docs/en/latest/plugins/prometheus.md   |  72 +++++++++++
 docs/zh/latest/plugins/prometheus.md   |  72 +++++++++++
 t/plugin/prometheus-label-filter.t     | 172 +++++++++++++++++++++++++
 5 files changed, 529 insertions(+), 65 deletions(-)

diff --git a/apisix/plugins/prometheus.lua b/apisix/plugins/prometheus.lua
index 592c12ab5..b1f3c58b3 100644
--- a/apisix/plugins/prometheus.lua
+++ b/apisix/plugins/prometheus.lua
@@ -16,6 +16,8 @@
 --
 local core = require("apisix.core")
 local exporter = require("apisix.plugins.prometheus.exporter")
+local pairs = pairs
+local ipairs = ipairs
 
 local plugin_name = "prometheus"
 local schema = {
@@ -29,6 +31,50 @@ local schema = {
 }
 
 
+-- Labels that define a metric's identity cannot be disabled: e.g. collapsing
+-- `type` would merge request/upstream/apisix latencies into one histogram series.
+local structural_labels = {
+    http_status = {code = true},
+    http_latency = {type = true},
+    bandwidth = {type = true},
+    llm_latency = {type = true},
+}
+
+
+local function build_disabled_labels_properties()
+    local properties = {}
+    for metric_name, metric_labels in pairs(exporter.metric_label_map) do
+        local enum = {}
+        local structural = structural_labels[metric_name]
+        for _, label in ipairs(metric_labels) do
+            if not (structural and structural[label]) then
+                core.table.insert(enum, label)
+            end
+        end
+        properties[metric_name] = {
+            type = "array",
+            items = {
+                type = "string",
+                enum = enum,
+            },
+        }
+    end
+    return properties
+end
+
+
+local metadata_schema = {
+    type = "object",
+    properties = {
+        disabled_labels = {
+            type = "object",
+            properties = build_disabled_labels_properties(),
+            additionalProperties = false,
+        },
+    },
+}
+
+
 local _M = {
     version = 0.2,
     priority = 500,
@@ -36,11 +82,16 @@ local _M = {
     log  = exporter.http_log,
     destroy = exporter.destroy,
     schema = schema,
+    metadata_schema = metadata_schema,
     run_policy = "prefer_route",
 }
 
 
-function _M.check_schema(conf)
+function _M.check_schema(conf, schema_type)
+    if schema_type == core.schema.TYPE_METADATA then
+        return core.schema.check(metadata_schema, conf)
+    end
+
     local ok, err = core.schema.check(schema, conf)
     if not ok then
         return false, err
diff --git a/apisix/plugins/prometheus/exporter.lua 
b/apisix/plugins/prometheus/exporter.lua
index cc4673839..90c5d810d 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -77,7 +77,6 @@ local exporter_timer_running = false
 
 local exporter_timer_created = false
 
-
 local function gen_arr(...)
     clear_tab(inner_tab_arr)
     for i = 1, select('#', ...) do
@@ -113,7 +112,99 @@ local function extra_labels(name, ctx)
 end
 
 
-local _M = {}
+local lrucache = core.lrucache.new({
+    type = "plugin",
+})
+
+
+local metric_label_map = {
+    http_status = {"code", "route", "matched_uri", "matched_host", "service", 
"consumer", "node",
+        "request_type", "request_llm_model", "llm_model", "response_source"},
+    http_latency = {"type", "route", "service", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    bandwidth = {"type", "route", "service", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_latency = {"type", "route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_prompt_tokens = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_completion_tokens = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_active_connections = {"route", "route_id", "matched_uri", 
"matched_host",
+        "service", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_prompt_tokens_dist = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    llm_completion_tokens_dist = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+}
+
+
+local function append_tables(...)
+    local res = {}
+    for _, tab in ipairs({...}) do
+        for _, v in ipairs(tab) do
+            core.table.insert(res, v)
+        end
+    end
+    return res
+end
+
+
+-- shared and read-only: avoids allocating in the log hot path
+local empty_disabled_map = {}
+
+
+local function build_disabled_label_metric_map(disabled_labels)
+    local disabled_label_metric_map = {}
+    for metric_name, labels in pairs(disabled_labels) do
+        disabled_label_metric_map[metric_name] = {}
+        for _, label in ipairs(labels) do
+            disabled_label_metric_map[metric_name][label] = true
+        end
+    end
+    return disabled_label_metric_map
+end
+
+
+-- Returns metric_name -> {label = true}, rebuilt only when the metadata changes.
+local function get_disabled_label_metric_map()
+    local metadata = plugin.plugin_metadata(plugin_name)
+    if not (metadata and metadata.value and metadata.value.disabled_labels
+            and metadata.modifiedIndex) then
+        return empty_disabled_map
+    end
+
+    return lrucache(plugin_name, metadata.modifiedIndex,
+                    build_disabled_label_metric_map, 
metadata.value.disabled_labels)
+end
+
+
+local function get_enabled_label_values_for_metric(metric_name, 
disabled_label_metric_map, ...)
+    local label_values = gen_arr(...)
+
+    -- fast path: nothing disabled for this metric
+    local disabled_labels = disabled_label_metric_map[metric_name]
+    if not disabled_labels then
+        return label_values
+    end
+
+    -- iterate the ordered label list rather than `label_values`: a nil value
+    -- must not end the scan early, and extra_labels after the built-ins stay 
untouched
+    local metric_labels = metric_label_map[metric_name]
+    for i = 1, #metric_labels do
+        if disabled_labels[metric_labels[i]] then
+            label_values[i] = ""
+        end
+    end
+
+    return label_values
+end
+
+
+local _M = {
+    metric_label_map = metric_label_map,
+}
 
 
 local function init_stream_metrics()
@@ -211,10 +302,7 @@ function _M.http_init(prometheus_enabled_in_stream)
     -- no consumer in request.
     metrics.status = prometheus:counter("http_status",
             "HTTP status codes per service in APISIX",
-            {"code", "route", "matched_uri", "matched_host", "service", 
"consumer", "node",
-            "request_type", "request_llm_model", "llm_model",
-            "response_source",
-            unpack(extra_labels("http_status"))},
+            append_tables(metric_label_map.http_status, 
extra_labels("http_status")),
             status_metrics_exptime)
 
     local buckets = DEFAULT_BUCKETS
@@ -223,54 +311,45 @@ function _M.http_init(prometheus_enabled_in_stream)
     end
 
     metrics.latency = prometheus:histogram("http_latency",
-        "HTTP request latency in milliseconds per service in APISIX",
-        {"type", "route", "service", "consumer", "node",
-        "request_type", "request_llm_model", "llm_model",
-        unpack(extra_labels("http_latency"))},
-        buckets, latency_metrics_exptime)
+            "HTTP request latency in milliseconds per service in APISIX",
+            append_tables(metric_label_map.http_latency, 
extra_labels("http_latency")),
+            buckets, latency_metrics_exptime)
 
     metrics.bandwidth = prometheus:counter("bandwidth",
             "Total bandwidth in bytes consumed per service in APISIX",
-            {"type", "route", "service", "consumer", "node",
-            "request_type", "request_llm_model", "llm_model",
-            unpack(extra_labels("bandwidth"))},
+            append_tables(metric_label_map.bandwidth, 
extra_labels("bandwidth")),
             bandwidth_metrics_exptime)
 
     local llm_latency_buckets = DEFAULT_BUCKETS
     if attr and attr.llm_latency_buckets then
         llm_latency_buckets = attr.llm_latency_buckets
     end
+
     -- The "type" label distinguishes latency kinds, mirroring 
apisix_http_latency:
     --   total - full response latency (both ai_chat and ai_stream)
     --   ttft  - time to first token (ai_stream only)
     metrics.llm_latency = prometheus:histogram("llm_latency",
-        "LLM request latency in milliseconds",
-        {"type", "route_id", "service_id", "consumer", "node",
-        "request_type", "request_llm_model", "llm_model",
-        unpack(extra_labels("llm_latency"))},
-        llm_latency_buckets,
-        llm_latency_exptime)
+            "LLM request latency in milliseconds",
+            append_tables(metric_label_map.llm_latency, 
extra_labels("llm_latency")),
+            llm_latency_buckets,
+            llm_latency_exptime)
 
     metrics.llm_prompt_tokens = prometheus:counter("llm_prompt_tokens",
             "LLM service consumed prompt tokens",
-            {"route_id", "service_id", "consumer", "node",
-            "request_type", "request_llm_model", "llm_model",
-            unpack(extra_labels("llm_prompt_tokens"))},
+            append_tables(metric_label_map.llm_prompt_tokens,
+                          extra_labels("llm_prompt_tokens")),
             llm_prompt_tokens_exptime)
 
     metrics.llm_completion_tokens = prometheus:counter("llm_completion_tokens",
             "LLM service consumed completion tokens",
-            {"route_id", "service_id", "consumer", "node",
-            "request_type", "request_llm_model", "llm_model",
-            unpack(extra_labels("llm_completion_tokens"))},
+            append_tables(metric_label_map.llm_completion_tokens,
+                          extra_labels("llm_completion_tokens")),
             llm_completion_tokens_exptime)
 
     metrics.llm_active_connections = prometheus:gauge("llm_active_connections",
             "Number of active connections to LLM service",
-            {"route", "route_id", "matched_uri", "matched_host",
-            "service", "service_id", "consumer", "node",
-            "request_type", "request_llm_model", "llm_model",
-            unpack(extra_labels("llm_active_connections"))},
+            append_tables(metric_label_map.llm_active_connections,
+                          extra_labels("llm_active_connections")),
             llm_active_connections_exptime)
 
     local llm_prompt_tokens_buckets = DEFAULT_TOKEN_BUCKETS
@@ -279,9 +358,8 @@ function _M.http_init(prometheus_enabled_in_stream)
     end
     metrics.llm_prompt_tokens_dist = 
prometheus:histogram("llm_prompt_tokens_dist",
         "LLM prompt tokens distribution per request",
-        {"route_id", "service_id", "consumer", "node",
-        "request_type", "request_llm_model", "llm_model",
-        unpack(extra_labels("llm_prompt_tokens_dist"))},
+        append_tables(metric_label_map.llm_prompt_tokens_dist,
+                      extra_labels("llm_prompt_tokens_dist")),
         llm_prompt_tokens_buckets,
         llm_prompt_tokens_dist_exptime)
 
@@ -291,9 +369,8 @@ function _M.http_init(prometheus_enabled_in_stream)
     end
     metrics.llm_completion_tokens_dist = 
prometheus:histogram("llm_completion_tokens_dist",
         "LLM completion tokens distribution per request",
-        {"route_id", "service_id", "consumer", "node",
-        "request_type", "request_llm_model", "llm_model",
-        unpack(extra_labels("llm_completion_tokens_dist"))},
+        append_tables(metric_label_map.llm_completion_tokens_dist,
+                      extra_labels("llm_completion_tokens_dist")),
         llm_completion_tokens_buckets,
         llm_completion_tokens_dist_exptime)
 
@@ -329,6 +406,7 @@ end
 
 function _M.http_log(conf, ctx)
     local vars = ctx.var
+    local disabled_label_metric_map = get_disabled_label_metric_map()
 
     local route_id = ""
     local balancer_ip = ctx.balancer_ip or ""
@@ -358,43 +436,50 @@ function _M.http_log(conf, ctx)
     local response_source = core.response.get_response_source(ctx)
 
     metrics.status:inc(1,
-        gen_arr(vars.status, route_id, matched_uri, matched_host,
-                service_id, consumer_name, balancer_ip,
-                vars.request_type, vars.request_llm_model, vars.llm_model,
-                response_source,
-                unpack(extra_labels("http_status", ctx))))
+        get_enabled_label_values_for_metric("http_status", 
disabled_label_metric_map,
+            vars.status, route_id, matched_uri, matched_host,
+            service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            response_source,
+            unpack(extra_labels("http_status", ctx))))
 
     local latency, upstream_latency, apisix_latency = latency_details(ctx)
+
     local latency_extra_label_values = extra_labels("http_latency", ctx)
 
     metrics.latency:observe(latency,
-        gen_arr("request", route_id, service_id, consumer_name, balancer_ip,
-        vars.request_type, vars.request_llm_model, vars.llm_model,
-        unpack(latency_extra_label_values)))
+        get_enabled_label_values_for_metric("http_latency", 
disabled_label_metric_map,
+            "request", route_id, service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            unpack(latency_extra_label_values)))
 
     if upstream_latency then
         metrics.latency:observe(upstream_latency,
-            gen_arr("upstream", route_id, service_id, consumer_name, 
balancer_ip,
-            vars.request_type, vars.request_llm_model, vars.llm_model,
-            unpack(latency_extra_label_values)))
+            get_enabled_label_values_for_metric("http_latency", 
disabled_label_metric_map,
+                "upstream", route_id, service_id, consumer_name, balancer_ip,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
+                unpack(latency_extra_label_values)))
     end
 
     metrics.latency:observe(apisix_latency,
-        gen_arr("apisix", route_id, service_id, consumer_name, balancer_ip,
-        vars.request_type, vars.request_llm_model, vars.llm_model,
-        unpack(latency_extra_label_values)))
+        get_enabled_label_values_for_metric("http_latency", 
disabled_label_metric_map,
+            "apisix", route_id, service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            unpack(latency_extra_label_values)))
 
     local bandwidth_extra_label_values = extra_labels("bandwidth", ctx)
 
     metrics.bandwidth:inc(vars.request_length,
-        gen_arr("ingress", route_id, service_id, consumer_name, balancer_ip,
-        vars.request_type, vars.request_llm_model, vars.llm_model,
-        unpack(bandwidth_extra_label_values)))
+        get_enabled_label_values_for_metric("bandwidth", 
disabled_label_metric_map,
+            "ingress", route_id, service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            unpack(bandwidth_extra_label_values)))
 
     metrics.bandwidth:inc(vars.bytes_sent,
-        gen_arr("egress", route_id, service_id, consumer_name, balancer_ip,
-        vars.request_type, vars.request_llm_model, vars.llm_model,
-        unpack(bandwidth_extra_label_values)))
+        get_enabled_label_values_for_metric("bandwidth", 
disabled_label_metric_map,
+            "egress", route_id, service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            unpack(bandwidth_extra_label_values)))
 
     if vars.request_type == "ai_stream" or vars.request_type == "ai_chat" then
         local llm_time_to_first_token = vars.llm_time_to_first_token
@@ -404,35 +489,44 @@ function _M.http_log(conf, ctx)
             -- TTFT, so use apisix_upstream_response_time (refreshed on every
             -- chunk) to capture the time until the whole response completes.
             
metrics.llm_latency:observe(tonumber(vars.apisix_upstream_response_time),
-                gen_arr("total", route_id, service_id, consumer_name, 
balancer_ip,
+                get_enabled_label_values_for_metric("llm_latency", 
disabled_label_metric_map,
+                    "total", route_id, service_id, consumer_name, balancer_ip,
                     vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(extra_labels("llm_latency", ctx))))
 
             -- type="ttft": time to first token, only streaming exposes a real 
one.
             if vars.request_type == "ai_stream" then
                 metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
-                    gen_arr("ttft", route_id, service_id, consumer_name, 
balancer_ip,
+                    get_enabled_label_values_for_metric("llm_latency", 
disabled_label_metric_map,
+                        "ttft", route_id, service_id, consumer_name, 
balancer_ip,
                         vars.request_type, vars.request_llm_model, 
vars.llm_model,
                         unpack(extra_labels("llm_latency", ctx))))
             end
         end
+
         metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
-            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+            get_enabled_label_values_for_metric("llm_prompt_tokens", 
disabled_label_metric_map,
+                route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_prompt_tokens", ctx))))
 
         
metrics.llm_prompt_tokens_dist:observe(tonumber(vars.llm_prompt_tokens),
-            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+            get_enabled_label_values_for_metric("llm_prompt_tokens_dist",
+                disabled_label_metric_map,
+                route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_prompt_tokens_dist", ctx))))
 
         metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
-            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+            get_enabled_label_values_for_metric("llm_completion_tokens", 
disabled_label_metric_map,
+                route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_completion_tokens", ctx))))
 
         
metrics.llm_completion_tokens_dist:observe(tonumber(vars.llm_completion_tokens),
-            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+            get_enabled_label_values_for_metric("llm_completion_tokens_dist",
+                disabled_label_metric_map,
+                route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_completion_tokens_dist", ctx))))
     end
@@ -830,12 +924,15 @@ local function inc_llm_active_connections(ctx, value)
         matched_host = ctx.curr_req_matched._host or ""
     end
 
+    local disabled_label_metric_map = get_disabled_label_metric_map()
+
     metrics.llm_active_connections:inc(
         value,
-        gen_arr(route_name, route_id, matched_uri,
+        get_enabled_label_values_for_metric("llm_active_connections", 
disabled_label_metric_map,
+            route_name, route_id, matched_uri,
             matched_host, service_name, service_id, consumer_name, balancer_ip,
             vars.request_type, vars.request_llm_model, vars.llm_model,
-        unpack(extra_labels("llm_active_connections", ctx)))
+            unpack(extra_labels("llm_active_connections", ctx)))
     )
 end
 
diff --git a/docs/en/latest/plugins/prometheus.md 
b/docs/en/latest/plugins/prometheus.md
index 2f92041fe..194d32042 100644
--- a/docs/en/latest/plugins/prometheus.md
+++ b/docs/en/latest/plugins/prometheus.md
@@ -94,6 +94,18 @@ Reload APISIX for changes to take effect.
 | ------------- | ------- | -------- | ------- | ------------ | 
------------------------------------------ |
 | prefer_name | boolean | False    | false   |              | If true, export 
Route/Service name instead of their ID in Prometheus metrics. |
 
+## Metadata
+
+You can configure the Plugin through its [Plugin 
Metadata](../terminology/plugin-metadata.md), which is set dynamically through 
the Admin API and takes effect at runtime without a restart.
+
+| Name            | Type   | Required | Description                            
                                                                                
                                                                                
                                                                                
    |
+| --------------- | ------ | -------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
+| disabled_labels | object | False    | Per-metric map of built-in label names 
whose values are collapsed to an empty string `""` to reduce metric 
cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, 
`llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, 
`llm_active_connections`, `llm_prompt_tokens_dist`, 
`llm_completion_tokens_dist`. Structural labels that define a metric's identity 
(`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |
+
+Collapsing a label's value to `""` keeps the label registered in the metric 
schema, so existing dashboards, `absent()` alerts, and recording rules keep 
working — only the high-cardinality time series that differ solely by those 
labels are collapsed into one. This is useful in dynamic environments such as 
Kubernetes autoscaling, where the upstream node IP (`node` label) churns 
rapidly and would otherwise overflow the `prometheus-metrics` shared dict.
+
+See [Reduce Metric Cardinality by Disabling 
Labels](#reduce-metric-cardinality-by-disabling-labels) for an example.
+
 ## Metrics
 
 There are different types of metrics in Prometheus. To understand their 
differences, see [metrics 
types](https://prometheus.io/docs/concepts/metric_types/).
@@ -509,6 +521,66 @@ You should see an output similar to the following:
 apisix_http_status{code="200",route="1",matched_uri="/get",matched_host="",service="",consumer="",node="54.237.103.220",upstream_addr="54.237.103.220:80",route_name="extra-label"} 1
 ```
 
+### Reduce Metric Cardinality by Disabling Labels
+
+The following example demonstrates how to reduce metric cardinality by 
collapsing the values of selected built-in labels to an empty string `""` using 
the [Plugin Metadata](../terminology/plugin-metadata.md). This is useful in 
dynamic environments such as Kubernetes autoscaling, where the upstream node IP 
(`node` label) churns rapidly and would otherwise overflow the 
`prometheus-metrics` shared dict.
+
+Collapsing a label's value keeps the label registered in the metric schema, so 
existing dashboards, `absent()` alerts, and recording rules keep working. 
Structural labels that define a metric's identity (`code` on `http_status`, 
`type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled.
+
+Create a Route with the `prometheus` Plugin:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
+  -H "X-API-KEY: ${admin_key}" \
+  -d '{
+    "id": "prometheus-route",
+    "uri": "/get",
+    "plugins": {
+      "prometheus": {}
+    },
+    "upstream": {
+      "nodes": {
+        "httpbin.org:80": 1
+      }
+    }
+  }'
+```
+
+Configure the Plugin Metadata to collapse the `node` and `consumer` labels on 
`apisix_http_status` and the `node` label on `apisix_http_latency`:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/plugin_metadata/prometheus" -X PUT \
+  -H "X-API-KEY: ${admin_key}" \
+  -d '{
+    "disabled_labels": {
+      "http_status": ["node", "consumer"],
+      "http_latency": ["node"]
+    }
+  }'
+```
+
+Send a request to the Route to verify:
+
+```shell
+curl -i "http://127.0.0.1:9080/get"
+```
+
+You should see an `HTTP/1.1 200 OK` response.
+
+Send a request to the APISIX Prometheus metrics endpoint:
+
+```shell
+curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
+```
+
+You should see that `node` and `consumer` are collapsed to empty strings on 
`apisix_http_status`, while metrics that are not listed (such as 
`apisix_bandwidth`) keep all their label values:
+
+```text
+# HELP apisix_http_status HTTP status codes per service in APISIX
+# TYPE apisix_http_status counter
+apisix_http_status{code="200",route="prometheus-route",matched_uri="/get",matched_host="",service="",consumer="",node="",request_type="traditional_http",request_llm_model="",llm_model="",response_source="upstream"} 1
+```
+
 ### Monitor TCP/UDP Traffic with Prometheus
 
 The following example demonstrates how to collect TCP/UDP traffic metrics in 
APISIX.
diff --git a/docs/zh/latest/plugins/prometheus.md 
b/docs/zh/latest/plugins/prometheus.md
index ff3963a6e..d7eed001a 100644
--- a/docs/zh/latest/plugins/prometheus.md
+++ b/docs/zh/latest/plugins/prometheus.md
@@ -94,6 +94,18 @@ plugin_attr:
 | ------------ | --------| ------ | ------ | 
----------------------------------------------------- |
 |prefer_name | boolean | 否     | false  | 当设置为 `true` 时，则在`prometheus` 
指标中导出路由/服务名称而非它们的 `id`。 |
 
+## 元数据
+
+你可以通过插件的[元数据（Plugin Metadata）](../terminology/plugin-metadata.md)进行配置。元数据通过 
Admin API 动态设置，无需重启即可在运行时生效。
+
+| 名称            | 类型   | 必选项 | 描述                                              
                                                                                
                                                                                
                                                |
+| --------------- | ------ | ------ | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
+| disabled_labels | object | 否     | 按指标配置的内置标签列表，列出的标签其值会被设置为空字符串 `""` 
以降低指标基数。以指标名称作为键：`http_status`、`http_latency`、`bandwidth`、`llm_latency`、`llm_prompt_tokens`、`llm_completion_tokens`、`llm_active_connections`、`llm_prompt_tokens_dist`、`llm_completion_tokens_dist`。定义指标本身含义的结构性标签（`http_status`
 的 `code`、`http_latency`、`bandwidth` 与 `llm_latency` 的 `type`）不可被禁用。 |
+
+将标签值设置为 `""` 时，标签仍保留在指标 schema 中，因此现有的仪表盘、`absent()` 告警和 recording rule 
都不受影响——只是将仅因这些标签而不同的高基数时间序列合并为一条。这在 Kubernetes 弹性伸缩等动态环境中尤其有用：此时上游节点 IP（`node` 
标签）频繁变化，否则会很快撑爆 `prometheus-metrics` 共享字典。
+
+示例请参见[通过禁用标签降低指标基数](#通过禁用标签降低指标基数)。
+
 ## 指标
 
 Prometheus 
中有不同类型的指标。要了解它们之间的区别，请参见[指标类型](https://prometheus.io/docs/concepts/metric_types/)。
@@ -508,6 +520,66 @@ curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
 apisix_http_status{code="200",route="1",matched_uri="/get",matched_host="",service="",consumer="",node="54.237.103.220",upstream_addr="54.237.103.220:80",route_name="extra-label"} 1
 ```
 
+### 通过禁用标签降低指标基数
+
+以下示例演示如何通过[插件元数据（Plugin 
Metadata）](../terminology/plugin-metadata.md)将选定内置标签的值折叠为空字符串 `""`，从而降低指标基数。这在 
Kubernetes 弹性伸缩等动态环境中尤其有用：此时上游节点 IP（`node` 标签）频繁变化，否则会很快撑爆 `prometheus-metrics` 
共享字典。
+
+将标签值折叠后，标签仍保留在指标 schema 中，因此现有的仪表盘、`absent()` 告警和 recording rule 
都不受影响。定义指标本身含义的结构性标签（`http_status` 的 `code`、`http_latency`、`bandwidth` 与 
`llm_latency` 的 `type`）不可被禁用。
+
+创建一个启用 `prometheus` 插件的路由：
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes" -X PUT \
+  -H "X-API-KEY: ${admin_key}" \
+  -d '{
+    "id": "prometheus-route",
+    "uri": "/get",
+    "plugins": {
+      "prometheus": {}
+    },
+    "upstream": {
+      "nodes": {
+        "httpbin.org:80": 1
+      }
+    }
+  }'
+```
+
+配置插件元数据，将 `apisix_http_status` 的 `node` 和 `consumer` 标签、以及 
`apisix_http_latency` 的 `node` 标签的值折叠：
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/plugin_metadata/prometheus" -X PUT \
+  -H "X-API-KEY: ${admin_key}" \
+  -d '{
+    "disabled_labels": {
+      "http_status": ["node", "consumer"],
+      "http_latency": ["node"]
+    }
+  }'
+```
+
+向路由发送请求以进行验证：
+
+```shell
+curl -i "http://127.0.0.1:9080/get"
+```
+
+你应该看到 `HTTP/1.1 200 OK` 的响应。
+
+向 APISIX Prometheus 指标端点发送请求：
+
+```shell
+curl "http://127.0.0.1:9091/apisix/prometheus/metrics"
+```
+
+你应该看到 `apisix_http_status` 中的 `node` 和 `consumer` 被折叠为空字符串，而未列出的指标（如 
`apisix_bandwidth`）仍保留其所有标签值：
+
+```text
+# HELP apisix_http_status HTTP status codes per service in APISIX
+# TYPE apisix_http_status counter
+apisix_http_status{code="200",route="prometheus-route",matched_uri="/get",matched_host="",service="",consumer="",node="",request_type="traditional_http",request_llm_model="",llm_model="",response_source="upstream"} 1
+```
+
 ### 使用 Prometheus 监控 TCP/UDP 流量
 
 以下示例演示如何在 APISIX 中收集 TCP/UDP 流量指标。
diff --git a/t/plugin/prometheus-label-filter.t 
b/t/plugin/prometheus-label-filter.t
new file mode 100644
index 000000000..fe1324c1c
--- /dev/null
+++ b/t/plugin/prometheus-label-filter.t
@@ -0,0 +1,172 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+BEGIN {
+    if ($ENV{TEST_NGINX_CHECK_LEAK}) {
+        $SkipReason = "unavailable for the hup tests";
+
+    } else {
+        $ENV{TEST_NGINX_USE_HUP} = 1;
+        undef $ENV{TEST_NGINX_USE_STAP};
+    }
+}
+
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+    my ($block) = @_;
+
+    if (!defined $block->request) {
+        $block->set_value("request", "GET /t");
+    }
+
+    if (!defined $block->yaml_config) {
+        $block->set_value("yaml_config", <<'EOF');
+plugin_attr:
+    prometheus:
+        refresh_interval: 0.1
+EOF
+    }
+});
+
+run_tests;
+
+__DATA__
+
+=== TEST 1: set up routes and disable labels per-metric via plugin metadata
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+
+            local code = t('/apisix/admin/routes/metrics',
+                ngx.HTTP_PUT,
+                [[{
+                    "plugins": {"public-api": {}},
+                    "uri": "/apisix/prometheus/metrics"
+                }]])
+            if code >= 300 then
+                ngx.status = code
+                ngx.say("failed to create metrics route")
+                return
+            end
+
+            code = t('/apisix/admin/routes/1',
+                ngx.HTTP_PUT,
+                [[{
+                    "plugins": {"prometheus": {}},
+                    "upstream": {
+                        "nodes": {"127.0.0.1:1980": 1},
+                        "type": "roundrobin"
+                    },
+                    "uri": "/hello"
+                }]])
+            if code >= 300 then
+                ngx.status = code
+                ngx.say("failed to create route 1")
+                return
+            end
+
+            local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+                ngx.HTTP_PUT,
+                [[{
+                    "disabled_labels": {
+                        "http_status": ["route", "node"],
+                        "bandwidth": ["node"]
+                    }
+                }]])
+            if code >= 300 then
+                ngx.status = code
+                ngx.say(body)
+                return
+            end
+
+            -- give the data plane time to sync the routes and plugin metadata
+            ngx.sleep(1.5)
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 2: warm up metrics with client requests
+--- pipelined_requests eval
+["GET /hello", "GET /hello", "GET /hello"]
+--- error_code eval
+[200, 200, 200]
+
+
+
+=== TEST 3: http_status has disabled labels (route, node) collapsed to ""
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_http_status\{code="\d+",route="",matched_uri="[^"]*",matched_host="[^"]*",service="[^"]*",consumer="[^"]*",node="",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*",response_source="[^"]*"\} \d+/
+
+
+
+=== TEST 4: per-metric scoping - http_latency keeps route and node populated
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_http_latency_count\{type="request",route="1",service="[^"]*",consumer="[^"]*",node="127.0.0.1",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*"\} \d+/
+
+
+
+=== TEST 5: per-metric scoping - bandwidth collapses node but keeps route
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_bandwidth\{type="(?:ingress|egress)",route="1",service="[^"]*",consumer="[^"]*",node="",request_type="[^"]*",request_llm_model="[^"]*",llm_model="[^"]*"\} \d+/
+
+
+
+=== TEST 6: reject disabling a structural label (`code` on http_status)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+                ngx.HTTP_PUT,
+                [[{"disabled_labels": {"http_status": ["code"]}}]])
+            ngx.say(body)
+        }
+    }
+--- response_body eval
+qr/failed to validate item 1/
+
+
+
+=== TEST 7: reject an unknown metric key (additionalProperties = false)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/plugin_metadata/prometheus',
+                ngx.HTTP_PUT,
+                [[{"disabled_labels": {"unknown_metric": ["node"]}}]])
+            ngx.say(body)
+        }
+    }
+--- response_body eval
+qr/additional properties forbidden/

(apisix) branch master updated: feat(prometheus): support disabling labels via plugin metadata to reduce cardinality (#13202)

Reply via email to