This is an automated email from the ASF dual-hosted git repository.
AlinsRan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new 418a8c697 feat(prometheus): add built-in LLM histograms for TTFT and
token distribution (#13487)
418a8c697 is described below
commit 418a8c6978a06811fbeed68dd6b84170cb1115d6
Author: AlinsRan <[email protected]>
AuthorDate: Wed Jun 10 16:53:07 2026 +0800
feat(prometheus): add built-in LLM histograms for TTFT and token
distribution (#13487)
---
apisix/plugins/prometheus/exporter.lua | 65 ++++++++++++-
conf/config.yaml.example | 8 ++
docs/en/latest/plugins/prometheus.md | 32 +++++++
docs/zh/latest/plugins/prometheus.md | 32 +++++++
t/plugin/prometheus-ai-proxy.t | 163 +++++++++++++++++++++++++++++++++
5 files changed, 297 insertions(+), 3 deletions(-)
diff --git a/apisix/plugins/prometheus/exporter.lua
b/apisix/plugins/prometheus/exporter.lua
index ce89ca033..cc4673839 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -58,6 +58,12 @@ local plugin_name = "prometheus"
local default_export_uri = "/apisix/prometheus/metrics"
-- Default set of latency buckets, 1ms to 60s:
local DEFAULT_BUCKETS = {1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
10000, 30000, 60000}
+-- Default set of LLM token buckets, suitable for prompt/completion token
counts.
+-- OTel GenAI semconv does not prescribe bucket boundaries for token
histograms,
+-- so these are tuned to real-world token ranges (dense around common prompt
+-- sizes) with the upper bound raised to 1M to cover large-context models.
+local DEFAULT_TOKEN_BUCKETS = {1, 10, 50, 100, 200, 500, 1000, 2000, 5000,
10000,
+ 20000, 50000, 100000, 200000, 500000, 1000000}
-- Default refresh interval
local DEFAULT_REFRESH_INTERVAL = 15
@@ -160,6 +166,10 @@ function _M.http_init(prometheus_enabled_in_stream)
"llm_completion_tokens", "expire")
local llm_active_connections_exptime = core.table.try_read_attr(attr,
"metrics",
"llm_active_connections", "expire")
+ local llm_prompt_tokens_dist_exptime = core.table.try_read_attr(attr,
"metrics",
+
"llm_prompt_tokens_dist", "expire")
+ local llm_completion_tokens_dist_exptime = core.table.try_read_attr(attr,
"metrics",
+
"llm_completion_tokens_dist", "expire")
prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)
@@ -230,9 +240,12 @@ function _M.http_init(prometheus_enabled_in_stream)
if attr and attr.llm_latency_buckets then
llm_latency_buckets = attr.llm_latency_buckets
end
+ -- The "type" label distinguishes latency kinds, mirroring
apisix_http_latency:
+ -- total - full response latency (both ai_chat and ai_stream)
+ -- ttft - time to first token (ai_stream only)
metrics.llm_latency = prometheus:histogram("llm_latency",
"LLM request latency in milliseconds",
- {"route_id", "service_id", "consumer", "node",
+ {"type", "route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model",
unpack(extra_labels("llm_latency"))},
llm_latency_buckets,
@@ -260,6 +273,30 @@ function _M.http_init(prometheus_enabled_in_stream)
unpack(extra_labels("llm_active_connections"))},
llm_active_connections_exptime)
+ local llm_prompt_tokens_buckets = DEFAULT_TOKEN_BUCKETS
+ if attr and attr.llm_prompt_tokens_buckets then
+ llm_prompt_tokens_buckets = attr.llm_prompt_tokens_buckets
+ end
+ metrics.llm_prompt_tokens_dist =
prometheus:histogram("llm_prompt_tokens_dist",
+ "LLM prompt tokens distribution per request",
+ {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model",
+ unpack(extra_labels("llm_prompt_tokens_dist"))},
+ llm_prompt_tokens_buckets,
+ llm_prompt_tokens_dist_exptime)
+
+ local llm_completion_tokens_buckets = DEFAULT_TOKEN_BUCKETS
+ if attr and attr.llm_completion_tokens_buckets then
+ llm_completion_tokens_buckets = attr.llm_completion_tokens_buckets
+ end
+ metrics.llm_completion_tokens_dist =
prometheus:histogram("llm_completion_tokens_dist",
+ "LLM completion tokens distribution per request",
+ {"route_id", "service_id", "consumer", "node",
+ "request_type", "request_llm_model", "llm_model",
+ unpack(extra_labels("llm_completion_tokens_dist"))},
+ llm_completion_tokens_buckets,
+ llm_completion_tokens_dist_exptime)
+
if prometheus_enabled_in_stream then
init_stream_metrics()
end
@@ -362,20 +399,42 @@ function _M.http_log(conf, ctx)
if vars.request_type == "ai_stream" or vars.request_type == "ai_chat" then
local llm_time_to_first_token = vars.llm_time_to_first_token
if llm_time_to_first_token ~= "0" then
- metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
- gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ -- type="total": full response latency. For non-streaming this
equals
+ -- llm_time_to_first_token; for streaming, that var holds only the
+ -- TTFT, so use apisix_upstream_response_time (refreshed on every
+ -- chunk) to capture the time until the whole response completes.
+
metrics.llm_latency:observe(tonumber(vars.apisix_upstream_response_time),
+ gen_arr("total", route_id, service_id, consumer_name,
balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_latency", ctx))))
+
+ -- type="ttft": time to first token, only streaming exposes a real
one.
+ if vars.request_type == "ai_stream" then
+ metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
+ gen_arr("ttft", route_id, service_id, consumer_name,
balancer_ip,
+ vars.request_type, vars.request_llm_model,
vars.llm_model,
+ unpack(extra_labels("llm_latency", ctx))))
+ end
end
metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
gen_arr(route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_prompt_tokens", ctx))))
+
metrics.llm_prompt_tokens_dist:observe(tonumber(vars.llm_prompt_tokens),
+ gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(extra_labels("llm_prompt_tokens_dist", ctx))))
+
metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
gen_arr(route_id, service_id, consumer_name, balancer_ip,
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_completion_tokens", ctx))))
+
+
metrics.llm_completion_tokens_dist:observe(tonumber(vars.llm_completion_tokens),
+ gen_arr(route_id, service_id, consumer_name, balancer_ip,
+ vars.request_type, vars.request_llm_model, vars.llm_model,
+ unpack(extra_labels("llm_completion_tokens_dist", ctx))))
end
end
diff --git a/conf/config.yaml.example b/conf/config.yaml.example
index 58052cb3d..e26dbd5a3 100644
--- a/conf/config.yaml.example
+++ b/conf/config.yaml.example
@@ -674,6 +674,14 @@ plugin_attr: # Plugin attributes
# - 100
# - 200
# - 500
+ # llm_prompt_tokens_buckets: # buckets for
apisix_llm_prompt_tokens_dist histogram (unit: token)
+ # - 100
+ # - 1000
+ # - 10000
+ # llm_completion_tokens_buckets: # buckets for
apisix_llm_completion_tokens_dist histogram (unit: token)
+ # - 100
+ # - 1000
+ # - 10000
server-info: # Plugin: server-info
report_ttl: 60 # Set the TTL in seconds for server info
in etcd.
# Maximum: 86400. Minimum: 3.
diff --git a/docs/en/latest/plugins/prometheus.md
b/docs/en/latest/plugins/prometheus.md
index 33d87ed80..2f92041fe 100644
--- a/docs/en/latest/plugins/prometheus.md
+++ b/docs/en/latest/plugins/prometheus.md
@@ -157,8 +157,14 @@ The following labels are used to differentiate
`apisix_bandwidth` metrics.
### Labels for `apisix_llm_latency`
+The `type` label distinguishes the kind of latency, similar to
`apisix_http_latency`:
+
+- `total`: the full response latency, recorded for both `ai_chat` and
`ai_stream` requests.
+- `ttft`: the time to first token, recorded for `ai_stream` requests only
(non-streaming responses do not expose a first-token moment).
+
| Name | Description
|
| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
|
+| type | Kind of latency: `total` or `ttft`.
|
| route_id | ID of the Route that bandwidth corresponds to when
`prefer_name` is `false` (default), and name of the Route when `prefer_name` to
`true`. Default to an empty string if a request does not match any Route.
|
| service_id | ID of the Service that bandwidth corresponds to when
`prefer_name` is `false` (default), and name of the Service when `prefer_name`
to `true`. Default to the configured value of host on the Route if the matched
Route does not belong to any Service. |
| consumer | Name of the Consumer associated with a request. Default to an
empty string if no Consumer is associated with the request.
|
@@ -203,6 +209,32 @@ The following labels are used to differentiate
`apisix_bandwidth` metrics.
| request_type | traditional_http / ai_chat / ai_stream
|
| llm_model | For non-traditional_http requests, name of the llm_model
|
+### Labels for `apisix_llm_prompt_tokens_dist`
+
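+`apisix_llm_prompt_tokens_dist` is a histogram of prompt tokens consumed per
request, complementing the `apisix_llm_prompt_tokens` counter with a
distribution so that quantiles (such as p95 prompt size) can be computed. In addition to the labels listed below, the histogram carries the `request_llm_model` label (the model name specified in the request).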
+
+| Name | Description
|
+| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
+| route_id | ID of the Route that the metric corresponds to when
`prefer_name` is `false` (default), and name of the Route when `prefer_name` is
`true`. Defaults to an empty string if a request does not match any Route.
|
+| service_id | ID of the Service that the metric corresponds to when
`prefer_name` is `false` (default), and name of the Service when `prefer_name`
is `true`. Defaults to the configured value of host on the Route if the matched
Route does not belong to any Service. |
+| consumer | Name of the Consumer associated with a request. Defaults to an
empty string if no Consumer is associated with the request.
|
+| node | IP address of the upstream node.
|
+| request_type | traditional_http / ai_chat / ai_stream
|
+| llm_model | For non-traditional_http requests, name of the llm_model
|
+
+### Labels for `apisix_llm_completion_tokens_dist`
+
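+`apisix_llm_completion_tokens_dist` is a histogram of completion tokens
generated per request, complementing the `apisix_llm_completion_tokens` counter
with a distribution so that quantiles (such as p95 completion size) can be computed. In addition to the labels listed below, the histogram carries the `request_llm_model` label (the model name specified in the request).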
+
+| Name | Description
|
+| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
+| route_id | ID of the Route that the metric corresponds to when
`prefer_name` is `false` (default), and name of the Route when `prefer_name` is
`true`. Defaults to an empty string if a request does not match any Route.
|
+| service_id | ID of the Service that the metric corresponds to when
`prefer_name` is `false` (default), and name of the Service when `prefer_name`
is `true`. Defaults to the configured value of host on the Route if the matched
Route does not belong to any Service. |
+| consumer | Name of the Consumer associated with a request. Defaults to an
empty string if no Consumer is associated with the request.
|
+| node | IP address of the upstream node.
|
+| request_type | traditional_http / ai_chat / ai_stream
|
+| llm_model | For non-traditional_http requests, name of the llm_model
|
+
### Labels for `apisix_http_latency`
The following labels are used to differentiate `apisix_http_latency` metrics.
diff --git a/docs/zh/latest/plugins/prometheus.md
b/docs/zh/latest/plugins/prometheus.md
index 2cc46005a..ff3963a6e 100644
--- a/docs/zh/latest/plugins/prometheus.md
+++ b/docs/zh/latest/plugins/prometheus.md
@@ -157,8 +157,14 @@ Prometheus 中有不同类型的指标。要了解它们之间的区别,请参
### `apisix_llm_latency` 的标签
+`type` 标签用于区分延迟类型,与 `apisix_http_latency` 类似:
+
+- `total`:完整的响应延迟,`ai_chat` 和 `ai_stream` 请求都会记录。
+- `ttft`:首个 token 到达时间,仅 `ai_stream` 请求记录(非流式响应没有"首个 token"这一时刻)。
+
| 名称 | 描述 |
| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
+| type | 延迟类型:`total` 或 `ttft`。
|
| route_id | 请求对应的路由 ID,当 `prefer_name` 为 `false`(默认)时,使用路由 ID,当
`prefer_name` 为 `true` 时,使用路由名称。如果请求不匹配任何路由,则默认为空字符串。 |
| service_id | 请求对应的服务 ID,当 `prefer_name` 为 `false`(默认)时,使用服务 ID,当
`prefer_name` 为 `true` 时,使用服务名称。如果匹配的路由不属于任何服务,则默认为路由上配置的主机值。 |
| consumer | 与请求关联的消费者名称。如果请求没有与之关联的消费者,则默认为空字符串。 |
@@ -203,6 +209,32 @@ Prometheus 中有不同类型的指标。要了解它们之间的区别,请参
| request_type | traditional_http / ai_chat / ai_stream
|
| llm_model | 对于非传统的 http 请求,llm 模型的名称
|
+### `apisix_llm_prompt_tokens_dist` 的标签
+
+`apisix_llm_prompt_tokens_dist` 是每次请求消耗的 prompt token 数的直方图,作为
`apisix_llm_prompt_tokens` 计数器的补充,提供分布信息以便计算分位数(如 p95 prompt 大小)。
+
+| 名称 | 描述 |
+| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
+| route_id | 请求对应的路由 ID,当 `prefer_name` 为 `false`(默认)时,使用路由 ID,当
`prefer_name` 为 `true` 时,使用路由名称。如果请求不匹配任何路由,则默认为空字符串。 |
+| service_id | 请求对应的服务 ID,当 `prefer_name` 为 `false`(默认)时,使用服务 ID,当
`prefer_name` 为 `true` 时,使用服务名称。如果匹配的路由不属于任何服务,则默认为路由上配置的主机值。 |
+| consumer | 与请求关联的消费者名称。如果请求没有与之关联的消费者,则默认为空字符串。 |
+| node | 上游节点的 IP 地址。
|
+| request_type | traditional_http / ai_chat / ai_stream
|
+| llm_model | 对于非传统的 http 请求,llm 模型的名称
|
+
+### `apisix_llm_completion_tokens_dist` 的标签
+
+`apisix_llm_completion_tokens_dist` 是每次请求生成的 completion token 数的直方图,作为
`apisix_llm_completion_tokens` 计数器的补充,提供分布信息。
+
+| 名称 | 描述 |
+| ---------- |
-----------------------------------------------------------------------------------------------------------------------------
|
+| route_id | 请求对应的路由 ID,当 `prefer_name` 为 `false`(默认)时,使用路由 ID,当
`prefer_name` 为 `true` 时,使用路由名称。如果请求不匹配任何路由,则默认为空字符串。 |
+| service_id | 请求对应的服务 ID,当 `prefer_name` 为 `false`(默认)时,使用服务 ID,当
`prefer_name` 为 `true` 时,使用服务名称。如果匹配的路由不属于任何服务,则默认为路由上配置的主机值。 |
+| consumer | 与请求关联的消费者名称。如果请求没有与之关联的消费者,则默认为空字符串。 |
+| node | 上游节点的 IP 地址。
|
+| request_type | traditional_http / ai_chat / ai_stream
|
+| llm_model | 对于非传统的 http 请求,llm 模型的名称
|
+
### `apisix_http_latency` 的标签
以下标签用于区分 `apisix_http_latency` 指标。
diff --git a/t/plugin/prometheus-ai-proxy.t b/t/plugin/prometheus-ai-proxy.t
index d46869623..9a641825b 100644
--- a/t/plugin/prometheus-ai-proxy.t
+++ b/t/plugin/prometheus-ai-proxy.t
@@ -269,3 +269,166 @@
qr/apisix_llm_active_connections\{.*route_id="1",.*,node="openai-gpt4".*.*reques
GET /t
--- response_body
success
+
+
+
+=== TEST 11: create a non-streaming route for token distribution histograms
+--- config
+ location /t {
+ content_by_lua_block {
+ local data = {
+ {
+ url = "/apisix/admin/routes/3",
+ data = [[{
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy-multi": {
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980"
+ }
+ }
+ ]
+ }
+ },
+ "uri": "/chat-dist"
+ }]],
+ },
+ }
+ local t = require("lib.test_admin").test
+ for _, data in ipairs(data) do
+ local _, body = t(data.url, ngx.HTTP_PUT, data.data)
+ ngx.say(body)
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 12: send a non-streaming chat request
+--- request
+POST /chat-dist
+{"messages":[{"role":"user","content":"What is 1+1?"}], "model": "gpt-3"}
+--- more_headers
+X-AI-Fixture: prometheus/chat-basic.json
+--- error_code: 200
+
+
+
+=== TEST 13: assert llm_prompt_tokens_dist_count metric
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_prompt_tokens_dist_count\{.*route_id="3",.*,node="openai-gpt4".*request_type="ai_chat",request_llm_model="gpt-3",llm_model="gpt-4"\}
1/
+
+
+
+=== TEST 14: assert llm_completion_tokens_dist_count metric
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_completion_tokens_dist_count\{.*route_id="3",.*,node="openai-gpt4".*request_type="ai_chat",request_llm_model="gpt-3",llm_model="gpt-4"\}
1/
+
+
+
+=== TEST 15: llm_latency type=ttft is not recorded for non-streaming requests
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_unlike eval
+qr/apisix_llm_latency_count\{type="ttft",.*route_id="3"/
+
+
+
+=== TEST 16: create a streaming route for the TTFT histogram
+--- config
+ location /t {
+ content_by_lua_block {
+ local data = {
+ {
+ url = "/apisix/admin/routes/4",
+ data = [[{
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy-multi": {
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980"
+ }
+ }
+ ]
+ }
+ },
+ "uri": "/chat-stream"
+ }]],
+ },
+ }
+ local t = require("lib.test_admin").test
+ for _, data in ipairs(data) do
+ local _, body = t(data.url, ngx.HTTP_PUT, data.data)
+ ngx.say(body)
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 17: send a streaming chat request
+--- request
+POST /chat-stream
+{"messages":[{"role":"user","content":"What is 1+1?"}], "model": "gpt-3",
"stream": true}
+--- more_headers
+X-AI-Fixture: openai/chat-streaming.sse
+--- response_headers_like
+Content-Type: text/event-stream
+
+
+
+=== TEST 18: assert llm_latency type=ttft count for the streaming request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_count\{type="ttft",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4"\}
1/
+
+
+
+=== TEST 19: assert llm_latency type=ttft bucket for the streaming request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_bucket\{type="ttft",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4",le="\d+"\}
1/
+
+
+
+=== TEST 20: assert llm_latency type=total is also recorded for the streaming
request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_count\{type="total",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4"\}
1/