(apisix) branch master updated: feat(ai-proxy): add built-in nginx variables for LLM observability (#13477)

alinsran Wed, 10 Jun 2026 02:22:39 -0700

This is an automated email from the ASF dual-hosted git repository.

AlinsRan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git



The following commit(s) were added to refs/heads/master by this push:
     new bf9091acd feat(ai-proxy): add built-in nginx variables for LLM 
observability (#13477)
bf9091acd is described below

commit bf9091acd3ec65bd8fd90e24e937131279fdcbce
Author: AlinsRan <[email protected]>
AuthorDate: Wed Jun 10 17:22:25 2026 +0800

    feat(ai-proxy): add built-in nginx variables for LLM observability (#13477)
---
 apisix/cli/ngx_tpl.lua                             |   8 +
 apisix/core/ctx.lua                                |   8 +
 apisix/plugins/ai-protocols/anthropic-messages.lua |  41 ++++
 apisix/plugins/ai-protocols/openai-chat.lua        |  68 ++++++-
 apisix/plugins/ai-protocols/openai-responses.lua   |  69 +++++--
 apisix/plugins/ai-providers/base.lua               |  26 +++
 apisix/plugins/ai-proxy/base.lua                   |  33 +++-
 t/APISIX.pm                                        |  10 +-
 .../anthropic/messages-streaming-with-tool-use.sse |  14 ++
 t/fixtures/openai/chat-streaming-with-cache.sse    |   6 +
 .../openai/chat-streaming-with-tool-calls.sse      |   7 +
 .../openai/responses-streaming-with-cache.sse      |   6 +
 .../openai/responses-streaming-with-tool-call.sse  |   3 +
 t/fixtures/openai/responses-with-cache.json        |  22 +++
 t/plugin/ai-proxy-anthropic.t                      |  89 +++++++++
 t/plugin/ai-proxy3.t                               | 219 +++++++++++++++++++--
 16 files changed, 596 insertions(+), 33 deletions(-)

diff --git a/apisix/cli/ngx_tpl.lua b/apisix/cli/ngx_tpl.lua
index ca6642b9a..c86f83e70 100644
--- a/apisix/cli/ngx_tpl.lua
+++ b/apisix/cli/ngx_tpl.lua
@@ -830,6 +830,14 @@ http {
             set $llm_model                      '';
             set $llm_prompt_tokens              '0';
             set $llm_completion_tokens          '0';
+            set $llm_total_tokens               '0';
+            set $llm_stream                     'false';
+            set $llm_has_tool_calls             'false';
+            set $llm_tool_count                 '0';
+            set $llm_end_user_id                '';
+            set $llm_cache_read_input_tokens    '0';
+            set $llm_cache_creation_input_tokens '0';
+            set $llm_reasoning_tokens           '0';
 
 
             {% if use_apisix_base then %}
diff --git a/apisix/core/ctx.lua b/apisix/core/ctx.lua
index 07b30d7dd..58ee7c6b6 100644
--- a/apisix/core/ctx.lua
+++ b/apisix/core/ctx.lua
@@ -204,6 +204,14 @@ do
         llm_model                  = true,
         llm_prompt_tokens          = true,
         llm_completion_tokens      = true,
+        llm_total_tokens                = true,
+        llm_stream                      = true,
+        llm_has_tool_calls              = true,
+        llm_tool_count                  = true,
+        llm_end_user_id                 = true,
+        llm_cache_read_input_tokens     = true,
+        llm_cache_creation_input_tokens = true,
+        llm_reasoning_tokens            = true,
 
         upstream_mirror_host       = true,
         upstream_mirror_uri        = true,
diff --git a/apisix/plugins/ai-protocols/anthropic-messages.lua 
b/apisix/plugins/ai-protocols/anthropic-messages.lua
index a78d0c26e..a777f5136 100644
--- a/apisix/plugins/ai-protocols/anthropic-messages.lua
+++ b/apisix/plugins/ai-protocols/anthropic-messages.lua
@@ -66,6 +66,14 @@ function _M.parse_sse_event(event, ctx, state)
         end
         return { type = "skip" }
 
+    elseif event.type == "content_block_start" then
+        local data = core.json.decode(event.data, { null_as_nil = true })
+        if data and type(data.content_block) == "table"
+                and data.content_block.type == "tool_use" then
+            return { type = "skip", has_tool_call = true }
+        end
+        return { type = "skip" }
+
     elseif event.type == "message_delta" then
         local data, err = core.json.decode(event.data, { null_as_nil = true })
         if not data then
@@ -102,6 +110,9 @@ function _M.parse_sse_event(event, ctx, state)
                     prompt_tokens = usage.input_tokens or 0,
                     completion_tokens = usage.output_tokens or 0,
                     total_tokens = (usage.input_tokens or 0) + 
(usage.output_tokens or 0),
+                    cache_read_input_tokens = usage.cache_read_input_tokens or 
0,
+                    cache_creation_input_tokens = 
usage.cache_creation_input_tokens or 0,
+                    reasoning_tokens = 0,
                 },
                 raw_usage = usage,
             }
@@ -169,10 +180,40 @@ function _M.extract_usage(res_body)
         prompt_tokens = prompt,
         completion_tokens = completion,
         total_tokens = prompt + completion,
+        cache_read_input_tokens = raw.cache_read_input_tokens or 0,
+        cache_creation_input_tokens = raw.cache_creation_input_tokens or 0,
+        reasoning_tokens = 0,
     }, raw
 end
 
 
+--- Detect whether a non-streaming response contains tool calls.
+function _M.has_tool_call(res_body)
+    if type(res_body) ~= "table" or type(res_body.content) ~= "table" then
+        return false
+    end
+    for _, block in ipairs(res_body.content) do
+        if type(block) == "table" and block.type == "tool_use" then
+            return true
+        end
+    end
+    return false
+end
+
+
+--- Extract the end-user identifier from a request body.
+function _M.extract_end_user_id(body)
+    if type(body) ~= "table" then
+        return nil
+    end
+    local meta = body.metadata
+    if type(meta) == "table" and type(meta.user_id) == "string" then
+        return meta.user_id
+    end
+    return nil
+end
+
+
 --- Extract all text content from a request body for moderation.
 function _M.extract_request_content(body)
     local contents = {}
diff --git a/apisix/plugins/ai-protocols/openai-chat.lua 
b/apisix/plugins/ai-protocols/openai-chat.lua
index faa48f665..096af76df 100644
--- a/apisix/plugins/ai-protocols/openai-chat.lua
+++ b/apisix/plugins/ai-protocols/openai-chat.lua
@@ -79,14 +79,18 @@ function _M.parse_sse_event(event, ctx, state)
 
         local result = { type = "delta", data = data }
 
-        -- Extract text content from choices
+        -- Extract text content and detect tool calls from choices
         if type(data.choices) == "table" and #data.choices > 0 then
             local texts = {}
             for _, choice in ipairs(data.choices) do
-                if type(choice) == "table"
-                        and type(choice.delta) == "table"
-                        and type(choice.delta.content) == "string" then
-                    core.table.insert(texts, choice.delta.content)
+                if type(choice) == "table" and type(choice.delta) == "table" 
then
+                    if type(choice.delta.content) == "string" then
+                        core.table.insert(texts, choice.delta.content)
+                    end
+                    if type(choice.delta.tool_calls) == "table"
+                            and #choice.delta.tool_calls > 0 then
+                        result.has_tool_call = true
+                    end
                 end
             end
             if #texts > 0 then
@@ -96,13 +100,20 @@ function _M.parse_sse_event(event, ctx, state)
 
         -- Extract usage (null for non-final chunks; cjson decodes null as 
userdata)
         if type(data.usage) == "table" then
+            local u = data.usage
+            local pd = type(u.prompt_tokens_details) == "table" and 
u.prompt_tokens_details
+            local cd = type(u.completion_tokens_details) == "table" and 
u.completion_tokens_details
             result.type = "usage"
             result.usage = {
-                prompt_tokens = data.usage.prompt_tokens or 0,
-                completion_tokens = data.usage.completion_tokens or 0,
-                total_tokens = data.usage.total_tokens or 0,
+                prompt_tokens = u.prompt_tokens or 0,
+                completion_tokens = u.completion_tokens or 0,
+                total_tokens = u.total_tokens or 0,
+                cache_read_input_tokens = pd and pd.cached_tokens
+                                          or u.prompt_cache_hit_tokens or 0,
+                cache_creation_input_tokens = pd and 
pd.cache_creation_input_tokens or 0,
+                reasoning_tokens = cd and cd.reasoning_tokens or 0,
             }
-            result.raw_usage = data.usage
+            result.raw_usage = u
         end
 
         return result
@@ -160,14 +171,53 @@ function _M.extract_usage(res_body)
         return nil, nil
     end
     local raw = res_body.usage
+    local pdetails = type(raw.prompt_tokens_details) == "table" and 
raw.prompt_tokens_details
+    local cdetails = type(raw.completion_tokens_details) == "table"
+                     and raw.completion_tokens_details
+    -- OpenAI uses prompt_tokens_details.cached_tokens; DeepSeek uses 
prompt_cache_hit_tokens
+    local cache_read = pdetails and pdetails.cached_tokens or 
raw.prompt_cache_hit_tokens or 0
     return {
         prompt_tokens = raw.prompt_tokens or 0,
         completion_tokens = raw.completion_tokens or 0,
         total_tokens = raw.total_tokens or (raw.prompt_tokens or 0) + 
(raw.completion_tokens or 0),
+        cache_read_input_tokens = cache_read,
+        cache_creation_input_tokens = pdetails and 
pdetails.cache_creation_input_tokens or 0,
+        reasoning_tokens = cdetails and cdetails.reasoning_tokens or 0,
     }, raw
 end
 
 
+--- Detect whether a non-streaming response contains tool calls.
+function _M.has_tool_call(res_body)
+    if type(res_body) ~= "table" or type(res_body.choices) ~= "table" then
+        return false
+    end
+    for _, choice in ipairs(res_body.choices) do
+        if type(choice) == "table" and type(choice.message) == "table"
+                and type(choice.message.tool_calls) == "table"
+                and #choice.message.tool_calls > 0 then
+            return true
+        end
+    end
+    return false
+end
+
+
+--- Extract the end-user identifier from a request body.
+function _M.extract_end_user_id(body)
+    if type(body) ~= "table" then
+        return nil
+    end
+    if type(body.safety_identifier) == "string" then
+        return body.safety_identifier
+    end
+    if type(body.user) == "string" then
+        return body.user
+    end
+    return nil
+end
+
+
 --- Extract all text content from a request body for moderation.
 function _M.extract_request_content(body)
     local contents = {}
diff --git a/apisix/plugins/ai-protocols/openai-responses.lua 
b/apisix/plugins/ai-protocols/openai-responses.lua
index 9e05e99a7..9e78b3fb5 100644
--- a/apisix/plugins/ai-protocols/openai-responses.lua
+++ b/apisix/plugins/ai-protocols/openai-responses.lua
@@ -68,16 +68,30 @@ function _M.parse_sse_event(event, ctx, state)
             core.log.warn("failed to decode response.completed SSE data: ", 
err)
             return result
         end
-        if type(data.response) == "table"
-                and type(data.response.usage) == "table" then
-            local usage = data.response.usage
-            result.type = "usage_and_done"
-            result.usage = {
-                prompt_tokens = usage.input_tokens or 0,
-                completion_tokens = usage.output_tokens or 0,
-                total_tokens = usage.total_tokens or 0,
-            }
-            result.raw_usage = usage
+        if type(data.response) == "table" then
+            local resp = data.response
+            if type(resp.usage) == "table" then
+                local usage = resp.usage
+                result.type = "usage_and_done"
+                result.usage = {
+                    prompt_tokens = usage.input_tokens or 0,
+                    completion_tokens = usage.output_tokens or 0,
+                    total_tokens = usage.total_tokens or 0,
+                    cache_read_input_tokens = type(usage.input_tokens_details) 
== "table"
+                        and usage.input_tokens_details.cached_tokens or 0,
+                    reasoning_tokens = type(usage.output_tokens_details) == 
"table"
+                        and usage.output_tokens_details.reasoning_tokens or 0,
+                }
+                result.raw_usage = usage
+            end
+            if type(resp.output) == "table" then
+                for _, item in ipairs(resp.output) do
+                    if type(item) == "table" and item.type == "function_call" 
then
+                        result.has_tool_call = true
+                        break
+                    end
+                end
+            end
         end
         return result
 
@@ -135,17 +149,50 @@ function _M.extract_usage(res_body)
         return nil, nil
     end
     local raw = res_body.usage
-    -- Responses API uses input_tokens / output_tokens
+    local idetails = type(raw.input_tokens_details) == "table" and 
raw.input_tokens_details
+    local odetails = type(raw.output_tokens_details) == "table" and 
raw.output_tokens_details
     local prompt = raw.input_tokens or 0
     local completion = raw.output_tokens or 0
     return {
         prompt_tokens = prompt,
         completion_tokens = completion,
         total_tokens = raw.total_tokens or (prompt + completion),
+        cache_read_input_tokens = idetails and idetails.cached_tokens or 0,
+        cache_creation_input_tokens = 0,
+        reasoning_tokens = odetails and odetails.reasoning_tokens or 0,
     }, raw
 end
 
 
+--- Detect whether a non-streaming response contains tool calls.
+function _M.has_tool_call(res_body)
+    if type(res_body) ~= "table" or type(res_body.output) ~= "table" then
+        return false
+    end
+    for _, item in ipairs(res_body.output) do
+        if type(item) == "table" and item.type == "function_call" then
+            return true
+        end
+    end
+    return false
+end
+
+
+--- Extract the end-user identifier from a request body.
+function _M.extract_end_user_id(body)
+    if type(body) ~= "table" then
+        return nil
+    end
+    if type(body.safety_identifier) == "string" then
+        return body.safety_identifier
+    end
+    if type(body.user) == "string" then
+        return body.user
+    end
+    return nil
+end
+
+
 --- Extract all text content from a request body for moderation.
 function _M.extract_request_content(body)
     local contents = {}
diff --git a/apisix/plugins/ai-providers/base.lua 
b/apisix/plugins/ai-providers/base.lua
index 407fe0bb8..3021b89cd 100644
--- a/apisix/plugins/ai-providers/base.lua
+++ b/apisix/plugins/ai-providers/base.lua
@@ -77,6 +77,13 @@ local function merge_usage(ctx, parsed)
                 ctx.ai_token_usage[k] = v
             end
         end
+        -- Recompute total from accumulated parts (handles split events, e.g. 
Anthropic
+        -- message_start carries input tokens and message_delta carries output 
tokens)
+        local computed = (ctx.ai_token_usage.prompt_tokens or 0)
+                       + (ctx.ai_token_usage.completion_tokens or 0)
+        if computed > (ctx.ai_token_usage.total_tokens or 0) then
+            ctx.ai_token_usage.total_tokens = computed
+        end
     end
 
     local raw = parsed.raw_usage or parsed.usage
@@ -396,12 +403,22 @@ function _M.parse_response(self, ctx, res, client_proto, 
converter, conf)
     end
     ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0
     ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens or 0
+    ctx.var.llm_total_tokens = ctx.ai_token_usage.total_tokens or 0
+    ctx.var.llm_cache_read_input_tokens = 
ctx.ai_token_usage.cache_read_input_tokens or 0
+    ctx.var.llm_cache_creation_input_tokens = 
ctx.ai_token_usage.cache_creation_input_tokens or 0
+    ctx.var.llm_reasoning_tokens = ctx.ai_token_usage.reasoning_tokens or 0
 
     local response_text = client_proto.extract_response_text(res_body)
     if response_text then
         ctx.var.llm_response_text = response_text
     end
 
+    -- Detect tool calls (mirrors the streaming path, which sets this from
+    -- parse_sse_event.has_tool_call). Each client protocol knows its own 
shape.
+    if client_proto.has_tool_call and client_proto.has_tool_call(res_body) then
+        ctx.var.llm_has_tool_calls = "true"
+    end
+
     plugin.lua_response_filter(ctx, headers, raw_res_body)
     return res_body
 end
@@ -591,6 +608,9 @@ function _M.parse_streaming_response(self, ctx, res, 
target_proto, converter, co
         for _, event in ipairs(events) do
             -- Target protocol parses the provider's SSE format
             local parsed = target_proto.parse_sse_event(event, ctx, sse_state)
+            if parsed and parsed.has_tool_call then
+                ctx.var.llm_has_tool_calls = "true"
+            end
             if not parsed or parsed.type == "skip" then
                 goto CONTINUE
             end
@@ -618,6 +638,12 @@ function _M.parse_streaming_response(self, ctx, res, 
target_proto, converter, co
                 merge_usage(ctx, parsed)
                 ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens
                 ctx.var.llm_completion_tokens = 
ctx.ai_token_usage.completion_tokens
+                ctx.var.llm_total_tokens = ctx.ai_token_usage.total_tokens or 0
+                ctx.var.llm_cache_read_input_tokens =
+                    ctx.ai_token_usage.cache_read_input_tokens or 0
+                ctx.var.llm_cache_creation_input_tokens =
+                    ctx.ai_token_usage.cache_creation_input_tokens or 0
+                ctx.var.llm_reasoning_tokens = 
ctx.ai_token_usage.reasoning_tokens or 0
                 ctx.var.llm_response_text = table.concat(contents, "")
             end
 
diff --git a/apisix/plugins/ai-proxy/base.lua b/apisix/plugins/ai-proxy/base.lua
index d2b6e67a9..80c025f7a 100644
--- a/apisix/plugins/ai-proxy/base.lua
+++ b/apisix/plugins/ai-proxy/base.lua
@@ -32,6 +32,21 @@ local apisix_upstream = require("resty.apisix.upstream")
 local _M = {}
 
 
+-- Count tools in the final upstream request body.
+-- OpenAI Chat/Responses: body.tools array
+-- Anthropic Messages: body.tools array
+local function count_request_tools(body)
+    if type(body) ~= "table" then
+        return 0
+    end
+    local tools = body.tools
+    if type(tools) == "table" then
+        return #tools
+    end
+    return 0
+end
+
+
 local function resolve_cap(cap_entry, key, conf, ctx)
     local val = cap_entry and cap_entry[key]
     if type(val) == "function" then
@@ -48,6 +63,7 @@ function _M.set_logging(ctx, summaries, payloads)
             duration = ctx.var.llm_time_to_first_token,
             prompt_tokens = ctx.var.llm_prompt_tokens,
             completion_tokens = ctx.var.llm_completion_tokens,
+            total_tokens = ctx.var.llm_total_tokens,
             upstream_response_time = ctx.var.apisix_upstream_response_time,
         }
     end
@@ -181,6 +197,7 @@ function _M.before_proxy(conf, ctx, on_error)
         end
         ctx.ai_converter = converter
         ctx.ai_target_protocol = target_proto
+        local target_proto_module = protocols.get(target_proto)
 
         -- Step 2: Extract model from request
         local request_model = request_body.model
@@ -217,6 +234,18 @@ function _M.before_proxy(conf, ctx, on_error)
                 return 500, body
             end
 
+            -- Compute built-in AI log fields from the final upstream request
+            local final_body = params.body
+            local is_stream = ctx.var.request_type == "ai_stream"
+            ctx.var.llm_stream = is_stream and "true" or "false"
+            ctx.var.llm_tool_count = count_request_tools(final_body)
+            if target_proto_module and target_proto_module.extract_end_user_id 
then
+                local end_user = 
target_proto_module.extract_end_user_id(final_body)
+                if end_user then
+                    ctx.var.llm_end_user_id = end_user
+                end
+            end
+
             core.log.info("sending request to LLM server: ",
                           
core.json.delay_encode(log_sanitize.redact_params(params), true))
 
@@ -310,14 +339,16 @@ function _M.before_proxy(conf, ctx, on_error)
                                  "application/vnd.amazon.eventstream", 1, true)
             )
             if is_streaming_resp then
-                local target_proto_module = protocols.get(target_proto)
                 if not target_proto_module then
                     core.log.error("no protocol module for streaming target: 
", target_proto)
                     return 500
                 end
+
                 code, body = ai_provider:parse_streaming_response(
                     ctx, res, target_proto_module, converter, conf)
             else
+                -- Non-streaming: parse_response sets all llm_* token/tool vars
+                -- via the client protocol adapter.
                 local _, parse_err, parse_status = ai_provider:parse_response(
                     ctx, res, client_proto, converter, conf)
                 if parse_err then
diff --git a/t/APISIX.pm b/t/APISIX.pm
index 540c48f0e..cf342f82c 100644
--- a/t/APISIX.pm
+++ b/t/APISIX.pm
@@ -717,7 +717,7 @@ _EOC_
         require("apisix").http_exit_worker()
     }
 
-    log_format main escape=default '\$remote_addr - \$remote_user 
[\$time_local] \$http_host "\$request_line" \$status \$body_bytes_sent 
\$request_time "\$http_referer" "\$http_user_agent" \$upstream_addr 
\$upstream_status \$apisix_upstream_response_time 
"\$upstream_scheme://\$upstream_host\$upstream_uri" \$request_llm_model 
\$llm_model \$llm_time_to_first_token \$llm_prompt_tokens 
\$llm_completion_tokens "\$rate_limiting_info"';
+    log_format main escape=default '\$remote_addr - \$remote_user 
[\$time_local] \$http_host "\$request_line" \$status \$body_bytes_sent 
\$request_time "\$http_referer" "\$http_user_agent" \$upstream_addr 
\$upstream_status \$apisix_upstream_response_time 
"\$upstream_scheme://\$upstream_host\$upstream_uri" \$request_llm_model 
\$llm_model \$llm_time_to_first_token \$llm_prompt_tokens 
\$llm_completion_tokens \$llm_total_tokens \$llm_stream \$llm_has_tool_calls 
\$llm_tool_count \$llm_end_use [...]
 
     # fake server, only for test
     server {
@@ -923,6 +923,14 @@ _EOC_
             set \$llm_model                      '';
             set \$llm_prompt_tokens              '0';
             set \$llm_completion_tokens          '0';
+            set \$llm_total_tokens               '0';
+            set \$llm_stream                     'false';
+            set \$llm_has_tool_calls             'false';
+            set \$llm_tool_count                 '0';
+            set \$llm_end_user_id                '';
+            set \$llm_cache_read_input_tokens    '0';
+            set \$llm_cache_creation_input_tokens '0';
+            set \$llm_reasoning_tokens           '0';
 
             set \$apisix_upstream_response_time  \$upstream_response_time;
             access_log $apisix_home/t/servroot/logs/access.log main;
diff --git a/t/fixtures/anthropic/messages-streaming-with-tool-use.sse 
b/t/fixtures/anthropic/messages-streaming-with-tool-use.sse
new file mode 100644
index 000000000..425626a07
--- /dev/null
+++ b/t/fixtures/anthropic/messages-streaming-with-tool-use.sse
@@ -0,0 +1,14 @@
+event: message_start
+data: 
{"type":"message_start","message":{"id":"msg_123","type":"message","role":"assistant","content":[],"usage":{"input_tokens":20,"output_tokens":0}}}
+
+event: content_block_start
+data: 
{"type":"content_block_start","index":0,"content_block":{"type":"tool_use","id":"toolu_1","name":"get_weather"}}
+
+event: content_block_stop
+data: {"type":"content_block_stop","index":0}
+
+event: message_delta
+data: 
{"type":"message_delta","delta":{"stop_reason":"tool_use"},"usage":{"output_tokens":5}}
+
+event: message_stop
+data: {}
diff --git a/t/fixtures/openai/chat-streaming-with-cache.sse 
b/t/fixtures/openai/chat-streaming-with-cache.sse
new file mode 100644
index 000000000..11757f645
--- /dev/null
+++ b/t/fixtures/openai/chat-streaming-with-cache.sse
@@ -0,0 +1,6 @@
+data: 
{"id":"chatcmpl-cache1","object":"chat.completion.chunk","choices":[{"delta":{"role":"assistant","content":"Hi"},"index":0,"finish_reason":null}],"usage":null}
+
+data: 
{"id":"chatcmpl-cache1","object":"chat.completion.chunk","choices":[{"delta":{},"index":0,"finish_reason":"stop"}],"usage":{"prompt_tokens":30,"completion_tokens":15,"total_tokens":45,"prompt_tokens_details":{"cached_tokens":10,"cache_creation_input_tokens":5},"completion_tokens_details":{"reasoning_tokens":7}}}
+
+data: [DONE]
+
diff --git a/t/fixtures/openai/chat-streaming-with-tool-calls.sse 
b/t/fixtures/openai/chat-streaming-with-tool-calls.sse
new file mode 100644
index 000000000..e17710263
--- /dev/null
+++ b/t/fixtures/openai/chat-streaming-with-tool-calls.sse
@@ -0,0 +1,7 @@
+data: 
{"id":"chatcmpl-1","object":"chat.completion.chunk","choices":[{"delta":{"role":"assistant","tool_calls":[{"index":0,"id":"call_1","type":"function","function":{"name":"get_weather","arguments":""}}]},"index":0,"finish_reason":null}],"usage":null}
+
+data: 
{"id":"chatcmpl-1","object":"chat.completion.chunk","choices":[{"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{}"}}]},"index":0,"finish_reason":"tool_calls"}],"usage":null}
+
+data: 
{"id":"chatcmpl-1","object":"chat.completion.chunk","choices":[{"delta":{},"index":0,"finish_reason":null}],"usage":{"prompt_tokens":15,"completion_tokens":10,"total_tokens":25}}
+
+data: [DONE]
diff --git a/t/fixtures/openai/responses-streaming-with-cache.sse 
b/t/fixtures/openai/responses-streaming-with-cache.sse
new file mode 100644
index 000000000..67953da93
--- /dev/null
+++ b/t/fixtures/openai/responses-streaming-with-cache.sse
@@ -0,0 +1,6 @@
+event: response.output_text.delta
+data: {"type":"response.output_text.delta","delta":"Hello"}
+
+event: response.completed
+data: 
{"type":"response.completed","response":{"output":[{"type":"message","content":[{"type":"output_text","text":"Hello"}]}],"usage":{"input_tokens":20,"output_tokens":5,"total_tokens":25,"input_tokens_details":{"cached_tokens":10},"output_tokens_details":{"reasoning_tokens":3}}}}
+
diff --git a/t/fixtures/openai/responses-streaming-with-tool-call.sse 
b/t/fixtures/openai/responses-streaming-with-tool-call.sse
new file mode 100644
index 000000000..4c9dbc125
--- /dev/null
+++ b/t/fixtures/openai/responses-streaming-with-tool-call.sse
@@ -0,0 +1,3 @@
+event: response.completed
+data: 
{"type":"response.completed","response":{"output":[{"type":"function_call","id":"call_1","name":"get_weather","arguments":"{}"}],"usage":{"input_tokens":20,"output_tokens":5,"total_tokens":25}}}
+
diff --git a/t/fixtures/openai/responses-with-cache.json 
b/t/fixtures/openai/responses-with-cache.json
new file mode 100644
index 000000000..39c72846f
--- /dev/null
+++ b/t/fixtures/openai/responses-with-cache.json
@@ -0,0 +1,22 @@
+{
+  "id": "resp_cache1",
+  "object": "response",
+  "created_at": 1723780938,
+  "model": "{{model}}",
+  "output": [
+    {
+      "type": "message",
+      "role": "assistant",
+      "content": [
+        { "type": "output_text", "text": "Hello" }
+      ]
+    }
+  ],
+  "usage": {
+    "input_tokens": 40,
+    "output_tokens": 20,
+    "total_tokens": 60,
+    "input_tokens_details": { "cached_tokens": 12 },
+    "output_tokens_details": { "reasoning_tokens": 8 }
+  }
+}
diff --git a/t/plugin/ai-proxy-anthropic.t b/t/plugin/ai-proxy-anthropic.t
index e5912d192..67e435bbb 100644
--- a/t/plugin/ai-proxy-anthropic.t
+++ b/t/plugin/ai-proxy-anthropic.t
@@ -36,6 +36,7 @@ add_block_preprocessor(sub {
 
     my $user_yaml_config = <<_EOC_;
 plugins:
+  - ai-proxy
   - ai-proxy-multi
 _EOC_
     $block->set_value("extra_yaml_config", $user_yaml_config);
@@ -1666,3 +1667,91 @@ OK
 OK
 --- no_error_log
 [error]
+
+
+
+=== TEST 47: set route for native Anthropic protocol built-in var tests
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/v1/messages",
+                    "plugins": {
+                        "ai-proxy": {
+                            "provider": "anthropic",
+                            "auth": {
+                                "header": {
+                                    "x-api-key": "test-key"
+                                }
+                            },
+                            "options": {
+                                "model": "claude-3-5-sonnet-20241022"
+                            },
+                            "override": {
+                                "endpoint": "http://127.0.0.1:1980";
+                            },
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 48: Anthropic streaming accumulates total_tokens from split 
message_start and message_delta events
+--- request
+POST /v1/messages
+{"messages":[{"role":"user","content":"Hello"}],"model":"claude-3-5-sonnet-20241022","max_tokens":1024,"stream":true}
+--- more_headers
+X-AI-Fixture: anthropic/messages-streaming.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" claude-3-5-sonnet-20241022 
claude-3-5-sonnet-20241022 [\d.]+ 10 8 18 true false 0 /
+
+
+
+=== TEST 49: Anthropic streaming detects tool_use in content_block_start as 
tool call
+--- request
+POST /v1/messages
+{"messages":[{"role":"user","content":"What is the 
weather?"}],"model":"claude-3-5-sonnet-20241022","max_tokens":1024,"stream":true}
+--- more_headers
+X-AI-Fixture: anthropic/messages-streaming-with-tool-use.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" claude-3-5-sonnet-20241022 
claude-3-5-sonnet-20241022 [\d.]+ 20 5 25 true true 0 /
+
+
+
+=== TEST 50: Anthropic non-streaming writes cache tokens to access log
+--- request
+POST /v1/messages
+{"messages":[{"role":"user","content":"Hello"}],"model":"claude-3-5-sonnet-20241022","max_tokens":1024}
+--- more_headers
+X-AI-Fixture: anthropic/messages-with-cache.json
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" claude-3-5-sonnet-20241022 
claude-3-5-sonnet-20241022 [\d.]+ 50 30 80 false false 0 \S* 200 100 0/
+
+
+
+=== TEST 51: Anthropic streaming writes cache tokens to access log
+--- request
+POST /v1/messages
+{"messages":[{"role":"user","content":"Hello"}],"model":"claude-3-5-sonnet-20241022","max_tokens":1024,"stream":true}
+--- more_headers
+X-AI-Fixture: anthropic/messages-streaming-with-cache.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" claude-3-5-sonnet-20241022 
claude-3-5-sonnet-20241022 [\d.]+ 50 30 80 true false 0 \S* 200 100 0/
diff --git a/t/plugin/ai-proxy3.t b/t/plugin/ai-proxy3.t
index 3110ee04b..501c84952 100644
--- a/t/plugin/ai-proxy3.t
+++ b/t/plugin/ai-proxy3.t
@@ -23,15 +23,6 @@ no_long_string();
 no_root_location();
 
 
-my $resp_file = 't/assets/ai-proxy-response.json';
-open(my $fh, '<', $resp_file) or die "Could not open file '$resp_file' $!";
-my $resp = do { local $/; <$fh> };
-close($fh);
-
-print "Hello, World!\n";
-print $resp;
-
-
 add_block_preprocessor(sub {
     my ($block) = @_;
 
@@ -96,7 +87,7 @@ X-AI-Fixture: openai/chat-basic.json
 --- response_body eval
 qr/.*completion_tokens.*/
 --- access_log eval
-qr/127\.0\.0\.1:1980 200 [\d.]+ 
\"http:\/\/127\.0\.0\.1\/v1\/chat\/completions\" gpt-4 gpt-3.5-turbo [\d.]+ 23 
8.*/
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"http:\/\/\S+\/v1\/chat\/completions\" gpt-4 
gpt-3.5-turbo [\d.]+ 23 8.*/
 
 
 
@@ -256,4 +247,210 @@ passed
 --- response_body_like eval
 qr/6data: \[DONE\]\n\n/
 --- access_log eval
-qr/localhost:7737 200 [\d.]+ \"http:\/\/localhost\/v1\/chat\/completions\" 
gpt-4 gpt-3.5-turbo 2\d\d 15 20.*/
+qr/localhost:7737 200 [\d.]+ \"http:\/\/\S+\/v1\/chat\/completions\" gpt-4 
gpt-3.5-turbo 2\d\d 15 20.*/
+
+
+
+=== TEST 7: set route for built-in access log variable test
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/2',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/log-vars",
+                    "plugins": {
+                        "ai-proxy": {
+                            "provider": "openai",
+                            "auth": {
+                                "header": {
+                                    "Authorization": "Bearer test-key"
+                                }
+                            },
+                            "options": {
+                                "model": "gpt-4o"
+                            },
+                            "override": {
+                                "endpoint": "http://127.0.0.1:1980";
+                            },
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 8: non-streaming request writes llm built-in vars to access log
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"What is 
1+1?"}],"model":"gpt-4o","user":"alice"}
+--- more_headers
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_body eval
+qr/.*completion_tokens.*/
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 23 8 31 false 
false 0 alice/
+
+
+
+=== TEST 9: streaming request writes llm built-in vars to access log
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"Hello"}],"model":"gpt-4o","stream":true}
+--- more_headers
+X-AI-Fixture: openai/chat-multi-chunk.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 10 2 12 true 
false 0 /
+
+
+
+=== TEST 10: response with cached tokens writes llm_cache_read_input_tokens to 
access log
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"Solve this"}],"model":"gpt-4o"}
+--- more_headers
+X-AI-Fixture: openai/chat-with-reasoning.json
+--- error_code: 200
+--- response_body eval
+qr/.*completion_tokens.*/
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 30 15 45 false 
false 0 \S* 10 5 0/
+
+
+
+=== TEST 11: response with tool calls sets llm_has_tool_calls=true and 
llm_tool_count
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"What is the 
weather?"}],"model":"gpt-4o","tools":[{"type":"function","function":{"name":"get_weather","parameters":{}}}]}
+--- more_headers
+X-AI-Fixture: openai/chat-with-tool-calls.json
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 50 20 70 false 
true 1 /
+
+
+
+=== TEST 12: safety_identifier field is used as llm_end_user_id
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"Hello"}],"model":"gpt-4o","safety_identifier":"user-xyz"}
+--- more_headers
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 23 8 31 false 
false 0 user-xyz/
+
+
+
+=== TEST 13: OpenAI Chat streaming detects tool_calls delta and sets 
llm_has_tool_calls=true
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"What is the 
weather?"}],"model":"gpt-4o","stream":true}
+--- more_headers
+X-AI-Fixture: openai/chat-streaming-with-tool-calls.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 15 10 25 true 
true 0 /
+
+
+
+=== TEST 14: set route for Responses API built-in var tests
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/3',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/ai/v1/responses",
+                    "plugins": {
+                        "ai-proxy": {
+                            "provider": "openai",
+                            "auth": {
+                                "header": {
+                                    "Authorization": "Bearer test-key"
+                                }
+                            },
+                            "options": {
+                                "model": "gpt-4o-mini"
+                            },
+                            "override": {
+                                "endpoint": "http://127.0.0.1:1980";
+                            },
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 15: Responses API streaming sets llm_cache_read_input_tokens and 
llm_reasoning_tokens
+--- request
+POST /ai/v1/responses
+{"input":"Hello","model":"gpt-4o-mini","stream":true}
+--- more_headers
+X-AI-Fixture: openai/responses-streaming-with-cache.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o-mini gpt-4o-mini [\d.]+ 20 5 25 
true false 0 \S* 10 0 3/
+
+
+
+=== TEST 16: Responses API streaming detects function_call in response.output 
as tool call
+--- request
+POST /ai/v1/responses
+{"input":"What is the weather?","model":"gpt-4o-mini","stream":true}
+--- more_headers
+X-AI-Fixture: openai/responses-streaming-with-tool-call.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o-mini gpt-4o-mini [\d.]+ 20 5 25 
true true 0 /
+
+
+
+=== TEST 17: OpenAI Chat streaming writes cache and reasoning tokens to access 
log
+--- request
+POST /log-vars
+{"messages":[{"role":"user","content":"Solve 
this"}],"model":"gpt-4o","stream":true}
+--- more_headers
+X-AI-Fixture: openai/chat-streaming-with-cache.sse
+--- error_code: 200
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o gpt-4o [\d.]+ 30 15 45 true 
false 0 \S* 10 5 7/
+
+
+
+=== TEST 18: Responses API non-streaming writes cache and reasoning tokens to 
access log
+--- request
+POST /ai/v1/responses
+{"input":"Solve this","model":"gpt-4o-mini"}
+--- more_headers
+X-AI-Fixture: openai/responses-with-cache.json
+--- error_code: 200
+--- response_body eval
+qr/.*output.*/
+--- access_log eval
+qr/127\.0\.0\.1:1980 200 [\d.]+ \"\S+\" gpt-4o-mini gpt-4o-mini [\d.]+ 40 20 
60 false false 0 \S* 12 0 8/

(apisix) branch master updated: feat(ai-proxy): add built-in nginx variables for LLM observability (#13477)

Reply via email to