(apisix) branch master updated: feat(prometheus): add built-in LLM histograms for TTFT and token distribution (#13487)

alinsran Wed, 10 Jun 2026 01:53:21 -0700

This is an automated email from the ASF dual-hosted git repository.

AlinsRan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git



The following commit(s) were added to refs/heads/master by this push:
     new 418a8c697 feat(prometheus): add built-in LLM histograms for TTFT and 
token distribution (#13487)
418a8c697 is described below

commit 418a8c6978a06811fbeed68dd6b84170cb1115d6
Author: AlinsRan <[email protected]>
AuthorDate: Wed Jun 10 16:53:07 2026 +0800

    feat(prometheus): add built-in LLM histograms for TTFT and token 
distribution (#13487)
---
 apisix/plugins/prometheus/exporter.lua |  65 ++++++++++++-
 conf/config.yaml.example               |   8 ++
 docs/en/latest/plugins/prometheus.md   |  32 +++++++
 docs/zh/latest/plugins/prometheus.md   |  32 +++++++
 t/plugin/prometheus-ai-proxy.t         | 163 +++++++++++++++++++++++++++++++++
 5 files changed, 297 insertions(+), 3 deletions(-)

diff --git a/apisix/plugins/prometheus/exporter.lua 
b/apisix/plugins/prometheus/exporter.lua
index ce89ca033..cc4673839 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -58,6 +58,12 @@ local plugin_name = "prometheus"
 local default_export_uri = "/apisix/prometheus/metrics"
 -- Default set of latency buckets, 1ms to 60s:
 local DEFAULT_BUCKETS = {1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 
10000, 30000, 60000}
+-- Default set of LLM token buckets, suitable for prompt/completion token 
counts.
+-- OTel GenAI semconv does not prescribe bucket boundaries for token 
histograms,
+-- so these are tuned to real-world token ranges (dense around common prompt
+-- sizes) with the upper bound raised to 1M to cover large-context models.
+local DEFAULT_TOKEN_BUCKETS = {1, 10, 50, 100, 200, 500, 1000, 2000, 5000, 
10000,
+                               20000, 50000, 100000, 200000, 500000, 1000000}
 -- Default refresh interval
 local DEFAULT_REFRESH_INTERVAL = 15
 
@@ -160,6 +166,10 @@ function _M.http_init(prometheus_enabled_in_stream)
                                                             
"llm_completion_tokens", "expire")
     local llm_active_connections_exptime = core.table.try_read_attr(attr, 
"metrics",
                                                             
"llm_active_connections", "expire")
+    local llm_prompt_tokens_dist_exptime = core.table.try_read_attr(attr, 
"metrics",
+                                                            
"llm_prompt_tokens_dist", "expire")
+    local llm_completion_tokens_dist_exptime = core.table.try_read_attr(attr, 
"metrics",
+                                                            
"llm_completion_tokens_dist", "expire")
 
     prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)
 
@@ -230,9 +240,12 @@ function _M.http_init(prometheus_enabled_in_stream)
     if attr and attr.llm_latency_buckets then
         llm_latency_buckets = attr.llm_latency_buckets
     end
+    -- The "type" label distinguishes latency kinds, mirroring 
apisix_http_latency:
+    --   total - full response latency (both ai_chat and ai_stream)
+    --   ttft  - time to first token (ai_stream only)
     metrics.llm_latency = prometheus:histogram("llm_latency",
         "LLM request latency in milliseconds",
-        {"route_id", "service_id", "consumer", "node",
+        {"type", "route_id", "service_id", "consumer", "node",
         "request_type", "request_llm_model", "llm_model",
         unpack(extra_labels("llm_latency"))},
         llm_latency_buckets,
@@ -260,6 +273,30 @@ function _M.http_init(prometheus_enabled_in_stream)
             unpack(extra_labels("llm_active_connections"))},
             llm_active_connections_exptime)
 
+    local llm_prompt_tokens_buckets = DEFAULT_TOKEN_BUCKETS
+    if attr and attr.llm_prompt_tokens_buckets then
+        llm_prompt_tokens_buckets = attr.llm_prompt_tokens_buckets
+    end
+    metrics.llm_prompt_tokens_dist = 
prometheus:histogram("llm_prompt_tokens_dist",
+        "LLM prompt tokens distribution per request",
+        {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model",
+        unpack(extra_labels("llm_prompt_tokens_dist"))},
+        llm_prompt_tokens_buckets,
+        llm_prompt_tokens_dist_exptime)
+
+    local llm_completion_tokens_buckets = DEFAULT_TOKEN_BUCKETS
+    if attr and attr.llm_completion_tokens_buckets then
+        llm_completion_tokens_buckets = attr.llm_completion_tokens_buckets
+    end
+    metrics.llm_completion_tokens_dist = 
prometheus:histogram("llm_completion_tokens_dist",
+        "LLM completion tokens distribution per request",
+        {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model",
+        unpack(extra_labels("llm_completion_tokens_dist"))},
+        llm_completion_tokens_buckets,
+        llm_completion_tokens_dist_exptime)
+
     if prometheus_enabled_in_stream then
         init_stream_metrics()
     end
@@ -362,20 +399,42 @@ function _M.http_log(conf, ctx)
     if vars.request_type == "ai_stream" or vars.request_type == "ai_chat" then
         local llm_time_to_first_token = vars.llm_time_to_first_token
         if llm_time_to_first_token ~= "0" then
-            metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
-                gen_arr(route_id, service_id, consumer_name, balancer_ip,
+            -- type="total": full response latency. For non-streaming this 
equals
+            -- llm_time_to_first_token; for streaming, that var holds only the
+            -- TTFT, so use apisix_upstream_response_time (refreshed on every
+            -- chunk) to capture the time until the whole response completes.
+            
metrics.llm_latency:observe(tonumber(vars.apisix_upstream_response_time),
+                gen_arr("total", route_id, service_id, consumer_name, 
balancer_ip,
                     vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(extra_labels("llm_latency", ctx))))
+
+            -- type="ttft": time to first token, only streaming exposes a real 
one.
+            if vars.request_type == "ai_stream" then
+                metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
+                    gen_arr("ttft", route_id, service_id, consumer_name, 
balancer_ip,
+                        vars.request_type, vars.request_llm_model, 
vars.llm_model,
+                        unpack(extra_labels("llm_latency", ctx))))
+            end
         end
         metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
             gen_arr(route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_prompt_tokens", ctx))))
 
+        
metrics.llm_prompt_tokens_dist:observe(tonumber(vars.llm_prompt_tokens),
+            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
+                unpack(extra_labels("llm_prompt_tokens_dist", ctx))))
+
         metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
             gen_arr(route_id, service_id, consumer_name, balancer_ip,
                 vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_completion_tokens", ctx))))
+
+        
metrics.llm_completion_tokens_dist:observe(tonumber(vars.llm_completion_tokens),
+            gen_arr(route_id, service_id, consumer_name, balancer_ip,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
+                unpack(extra_labels("llm_completion_tokens_dist", ctx))))
     end
 end
 
diff --git a/conf/config.yaml.example b/conf/config.yaml.example
index 58052cb3d..e26dbd5a3 100644
--- a/conf/config.yaml.example
+++ b/conf/config.yaml.example
@@ -674,6 +674,14 @@ plugin_attr:          # Plugin attributes
     #   - 100
     #   - 200
     #   - 500
+    # llm_prompt_tokens_buckets:      # buckets for 
apisix_llm_prompt_tokens_dist histogram (unit: token)
+    #   - 100
+    #   - 1000
+    #   - 10000
+    # llm_completion_tokens_buckets:  # buckets for 
apisix_llm_completion_tokens_dist histogram (unit: token)
+    #   - 100
+    #   - 1000
+    #   - 10000
   server-info:                        # Plugin: server-info
     report_ttl: 60                    # Set the TTL in seconds for server info 
in etcd.
                                       # Maximum: 86400. Minimum: 3.
diff --git a/docs/en/latest/plugins/prometheus.md 
b/docs/en/latest/plugins/prometheus.md
index 33d87ed80..2f92041fe 100644
--- a/docs/en/latest/plugins/prometheus.md
+++ b/docs/en/latest/plugins/prometheus.md
@@ -157,8 +157,14 @@ The following labels are used to differentiate 
`apisix_bandwidth` metrics.
 
 ### Labels for `apisix_llm_latency`
 
+The `type` label distinguishes the kind of latency, similar to 
`apisix_http_latency`:
+
+- `total`: the full response latency, recorded for both `ai_chat` and 
`ai_stream` requests.
+- `ttft`: the time to first token, recorded for `ai_stream` requests only 
(non-streaming responses do not expose a first-token moment).
+
 | Name | Description                                                           
                                                        |
 | ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |                                                                              
               |
+| type          | Kind of latency: `total` or `ttft`.                          
                                                                   |
 | route_id      | ID of the Route that bandwidth corresponds to when 
`prefer_name` is `false` (default), and name of the Route when `prefer_name` to 
`true`. Default to an empty string if a request does not match any Route.       
                  |
 | service_id    | ID of the Service that bandwidth corresponds to when 
`prefer_name` is `false` (default), and name of the Service when `prefer_name` 
to `true`. Default to the configured value of host on the Route if the matched 
Route does not belong to any Service. |
 | consumer   | Name of the Consumer associated with a request. Default to an 
empty string if no Consumer is associated with the request.                     
  |
@@ -203,6 +209,32 @@ The following labels are used to differentiate 
`apisix_bandwidth` metrics.
 | request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
 | llm_model       | For non-traditional_http requests, name of the llm_model   
                                                                                
       |
 
+### Labels for `apisix_llm_prompt_tokens_dist`
+
+`apisix_llm_prompt_tokens_dist` is a histogram of prompt tokens consumed per 
request, complementing the `apisix_llm_prompt_tokens` counter with a 
distribution so that quantiles (such as p95 prompt size) can be computed.
+
+| Name | Description                                                           
                                                        |
+| ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |
+| route_id      | ID of the Route that the metric corresponds to when 
`prefer_name` is `false` (default), and name of the Route when `prefer_name` is 
`true`. Defaults to an empty string if a request does not match any Route.      
                  |
+| service_id    | ID of the Service that the metric corresponds to when 
`prefer_name` is `false` (default), and name of the Service when `prefer_name` 
is `true`. Defaults to the configured value of host on the Route if the matched 
Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Defaults to an 
empty string if no Consumer is associated with the request.                     
  |
+| node       | IP address of the upstream node.                                
                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
+| llm_model       | For non-traditional_http requests, name of the llm_model   
                                                                                
       |
+
+### Labels for `apisix_llm_completion_tokens_dist`
+
+`apisix_llm_completion_tokens_dist` is a histogram of completion tokens 
generated per request, complementing the `apisix_llm_completion_tokens` counter 
with a distribution.
+
+| Name | Description                                                           
                                                        |
+| ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |
+| route_id      | ID of the Route that the metric corresponds to when 
`prefer_name` is `false` (default), and name of the Route when `prefer_name` is 
`true`. Defaults to an empty string if a request does not match any Route.      
                  |
+| service_id    | ID of the Service that the metric corresponds to when 
`prefer_name` is `false` (default), and name of the Service when `prefer_name` 
is `true`. Defaults to the configured value of host on the Route if the matched 
Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Defaults to an 
empty string if no Consumer is associated with the request.                     
  |
+| node       | IP address of the upstream node.                                
                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
+| llm_model       | For non-traditional_http requests, name of the llm_model   
                                                                                
       |
+
 ### Labels for `apisix_http_latency`
 
 The following labels are used to differentiate `apisix_http_latency` metrics.
diff --git a/docs/zh/latest/plugins/prometheus.md 
b/docs/zh/latest/plugins/prometheus.md
index 2cc46005a..ff3963a6e 100644
--- a/docs/zh/latest/plugins/prometheus.md
+++ b/docs/zh/latest/plugins/prometheus.md
@@ -157,8 +157,14 @@ Prometheus 中有不同类型的指标。要了解它们之间的区别，请参
 
 ### `apisix_llm_latency` 的标签
 
+`type` 标签用于区分延迟类型，与 `apisix_http_latency` 类似：
+
+- `total`：完整的响应延迟，`ai_chat` 和 `ai_stream` 请求都会记录。
+- `ttft`：首个 token 到达时间，仅 `ai_stream` 请求记录（非流式响应没有"首个 token"这一时刻）。
+
 | 名称 | 描述 |
 | ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |
+| type          | 延迟类型：`total` 或 `ttft`。                                       
                                                             |
 | route_id      | 请求对应的路由 ID，当 `prefer_name` 为 `false`（默认）时，使用路由 ID，当 
`prefer_name` 为 `true` 时，使用路由名称。如果请求不匹配任何路由，则默认为空字符串。                        |
 | service_id    | 请求对应的服务 ID，当 `prefer_name` 为 `false`（默认）时，使用服务 ID，当 
`prefer_name` 为 `true` 时，使用服务名称。如果匹配的路由不属于任何服务，则默认为路由上配置的主机值。 |
 | consumer   | 与请求关联的消费者名称。如果请求没有与之关联的消费者，则默认为空字符串。                       |
@@ -203,6 +209,32 @@ Prometheus 中有不同类型的指标。要了解它们之间的区别，请参
 | request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
 | llm_model       | 对于非传统的 http 请求，llm 模型的名称                                   
                                                       |
 
+### `apisix_llm_prompt_tokens_dist` 的标签
+
+`apisix_llm_prompt_tokens_dist` 是每次请求消耗的 prompt token 数的直方图，作为 
`apisix_llm_prompt_tokens` 计数器的补充，提供分布信息以便计算分位数（如 p95 prompt 大小）。
+
+| 名称 | 描述 |
+| ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |
+| route_id      | 请求对应的路由 ID，当 `prefer_name` 为 `false`（默认）时，使用路由 ID，当 
`prefer_name` 为 `true` 时，使用路由名称。如果请求不匹配任何路由，则默认为空字符串。                         |
+| service_id    | 请求对应的服务 ID，当 `prefer_name` 为 `false`（默认）时，使用服务 ID，当 
`prefer_name` 为 `true` 时，使用服务名称。如果匹配的路由不属于任何服务，则默认为路由上配置的主机值。 |
+| consumer   | 与请求关联的消费者名称。如果请求没有与之关联的消费者，则默认为空字符串。                       |
+| node       | 上游节点的 IP 地址。                                                    
                                      |
+| request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
+| llm_model       | 对于非传统的 http 请求，llm 模型的名称                                   
                                                       |
+
+### `apisix_llm_completion_tokens_dist` 的标签
+
+`apisix_llm_completion_tokens_dist` 是每次请求生成的 completion token 数的直方图，作为 
`apisix_llm_completion_tokens` 计数器的补充，提供分布信息。
+
+| 名称 | 描述 |
+| ---------- | 
-----------------------------------------------------------------------------------------------------------------------------
 |
+| route_id      | 请求对应的路由 ID，当 `prefer_name` 为 `false`（默认）时，使用路由 ID，当 
`prefer_name` 为 `true` 时，使用路由名称。如果请求不匹配任何路由，则默认为空字符串。                         |
+| service_id    | 请求对应的服务 ID，当 `prefer_name` 为 `false`（默认）时，使用服务 ID，当 
`prefer_name` 为 `true` 时，使用服务名称。如果匹配的路由不属于任何服务，则默认为路由上配置的主机值。 |
+| consumer   | 与请求关联的消费者名称。如果请求没有与之关联的消费者，则默认为空字符串。                       |
+| node       | 上游节点的 IP 地址。                                                    
                                      |
+| request_type       | traditional_http / ai_chat / ai_stream                  
                                                                        |
+| llm_model       | 对于非传统的 http 请求，llm 模型的名称                                   
                                                       |
+
 ### `apisix_http_latency` 的标签
 
 以下标签用于区分 `apisix_http_latency` 指标。
diff --git a/t/plugin/prometheus-ai-proxy.t b/t/plugin/prometheus-ai-proxy.t
index d46869623..9a641825b 100644
--- a/t/plugin/prometheus-ai-proxy.t
+++ b/t/plugin/prometheus-ai-proxy.t
@@ -269,3 +269,166 @@ 
qr/apisix_llm_active_connections\{.*route_id="1",.*,node="openai-gpt4".*.*reques
 GET /t
 --- response_body
 success
+
+
+
+=== TEST 11: create a non-streaming route for token distribution histograms
+--- config
+    location /t {
+        content_by_lua_block {
+            local data = {
+                {
+                    url = "/apisix/admin/routes/3",
+                    data = [[{
+                        "plugins": {
+                            "prometheus": {},
+                            "ai-proxy-multi": {
+                                "instances": [
+                                    {
+                                        "name": "openai-gpt4",
+                                        "provider": "openai",
+                                        "weight": 1,
+                                        "auth": {
+                                            "header": {
+                                                "Authorization": "Bearer token"
+                                            }
+                                        },
+                                        "options": {
+                                            "model": "gpt-4"
+                                        },
+                                        "override": {
+                                            "endpoint": "http://127.0.0.1:1980"
+                                        }
+                                    }
+                                ]
+                            }
+                        },
+                        "uri": "/chat-dist"
+                    }]],
+                },
+            }
+            local t = require("lib.test_admin").test
+            for _, data in ipairs(data) do
+                local _, body = t(data.url, ngx.HTTP_PUT, data.data)
+                ngx.say(body)
+            end
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 12: send a non-streaming chat request
+--- request
+POST /chat-dist
+{"messages":[{"role":"user","content":"What is 1+1?"}], "model": "gpt-3"}
+--- more_headers
+X-AI-Fixture: prometheus/chat-basic.json
+--- error_code: 200
+
+
+
+=== TEST 13: assert llm_prompt_tokens_dist_count metric
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_prompt_tokens_dist_count\{.*route_id="3",.*,node="openai-gpt4".*request_type="ai_chat",request_llm_model="gpt-3",llm_model="gpt-4"\}
 1/
+
+
+
+=== TEST 14: assert llm_completion_tokens_dist_count metric
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_completion_tokens_dist_count\{.*route_id="3",.*,node="openai-gpt4".*request_type="ai_chat",request_llm_model="gpt-3",llm_model="gpt-4"\}
 1/
+
+
+
+=== TEST 15: llm_latency type=ttft is not recorded for non-streaming requests
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_unlike eval
+qr/apisix_llm_latency_count\{type="ttft",.*route_id="3"/
+
+
+
+=== TEST 16: create a streaming route for the TTFT histogram
+--- config
+    location /t {
+        content_by_lua_block {
+            local data = {
+                {
+                    url = "/apisix/admin/routes/4",
+                    data = [[{
+                        "plugins": {
+                            "prometheus": {},
+                            "ai-proxy-multi": {
+                                "instances": [
+                                    {
+                                        "name": "openai-gpt4",
+                                        "provider": "openai",
+                                        "weight": 1,
+                                        "auth": {
+                                            "header": {
+                                                "Authorization": "Bearer token"
+                                            }
+                                        },
+                                        "options": {
+                                            "model": "gpt-4"
+                                        },
+                                        "override": {
+                                            "endpoint": "http://127.0.0.1:1980"
+                                        }
+                                    }
+                                ]
+                            }
+                        },
+                        "uri": "/chat-stream"
+                    }]],
+                },
+            }
+            local t = require("lib.test_admin").test
+            for _, data in ipairs(data) do
+                local _, body = t(data.url, ngx.HTTP_PUT, data.data)
+                ngx.say(body)
+            end
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 17: send a streaming chat request
+--- request
+POST /chat-stream
+{"messages":[{"role":"user","content":"What is 1+1?"}], "model": "gpt-3", 
"stream": true}
+--- more_headers
+X-AI-Fixture: openai/chat-streaming.sse
+--- response_headers_like
+Content-Type: text/event-stream
+
+
+
+=== TEST 18: assert llm_latency type=ttft count for the streaming request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_count\{type="ttft",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4"\}
 1/
+
+
+
+=== TEST 19: assert llm_latency type=ttft bucket for the streaming request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_bucket\{type="ttft",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4",le="\d+"\}
 1/
+
+
+
+=== TEST 20: assert llm_latency type=total is also recorded for the streaming 
request
+--- request
+GET /apisix/prometheus/metrics
+--- response_body eval
+qr/apisix_llm_latency_count\{type="total",.*route_id="4",.*,node="openai-gpt4".*request_type="ai_stream",request_llm_model="gpt-3",llm_model="gpt-4"\}
 1/

(apisix) branch master updated: feat(prometheus): add built-in LLM histograms for TTFT and token distribution (#13487)

Reply via email to