This is an automated email from the ASF dual-hosted git repository.
shreemaanabhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new 7d8eb88ae fix(ai-rate-limiting): not allowed to limit to a single instance (#12061)
7d8eb88ae is described below
commit 7d8eb88ae91d0fb594781781cd13ecebf1bef191
Author: Shreemaan Abhishek <[email protected]>
AuthorDate: Tue Mar 18 12:20:13 2025 +0545
fix(ai-rate-limiting): not allowed to limit to a single instance (#12061)
* fix(ai-rate-limiting): not allowed to limit to a single instance
* add docs
* lint
---
apisix/plugins/ai-rate-limiting.lua | 31 ++-
docs/en/latest/plugins/ai-rate-limiting.md | 8 +-
t/plugin/ai-rate-limiting.t | 369 ++++++++++++++++++++++++++++-
3 files changed, 399 insertions(+), 9 deletions(-)
diff --git a/apisix/plugins/ai-rate-limiting.lua b/apisix/plugins/ai-rate-limiting.lua
index 631f1ab49..d8bf970fa 100644
--- a/apisix/plugins/ai-rate-limiting.lua
+++ b/apisix/plugins/ai-rate-limiting.lua
@@ -47,7 +47,8 @@ local schema = {
},
instances = {
type = "array",
- items = instance_limit_schema
+ items = instance_limit_schema,
+ minItems = 1,
},
rejected_code = {
type = "integer", minimum = 200, maximum = 599, default = 503
@@ -56,7 +57,18 @@ local schema = {
type = "string", minLength = 1
},
},
- required = {"limit", "time_window"},
+ dependencies = {
+ limit = {"time_window"},
+ time_window = {"limit"}
+ },
+ anyOf = {
+ {
+ required = {"limit", "time_window"}
+ },
+ {
+ required = {"instances"}
+ }
+ }
}
local _M = {
@@ -112,6 +124,10 @@ end
local function fetch_limit_conf_kvs(conf)
local mt = {
__index = function(t, k)
+ if not conf.limit then
+ return nil
+ end
+
local limit_conf = transform_limit_conf(conf, nil, k)
t[k] = limit_conf
return limit_conf
@@ -134,6 +150,9 @@ function _M.access(conf, ctx)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[ai_instance_name]
+ if not limit_conf then
+ return
+ end
local code, msg = limit_count.rate_limit(limit_conf, ctx, plugin_name, 1, true)
ctx.ai_rate_limiting = code and true or false
return code, msg
@@ -164,6 +183,10 @@ function _M.check_instance_status(conf, ctx, instance_name)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[instance_name]
+ if not limit_conf then
+ return true
+ end
+
local code, _ = limit_count.rate_limit(limit_conf, ctx, plugin_name, 1, true)
if code then
core.log.info("rate limit for instance: ", instance_name, " code: ", code)
@@ -202,7 +225,9 @@ function _M.log(conf, ctx)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[instance_name]
- limit_count.rate_limit(limit_conf, ctx, plugin_name, used_tokens)
+ if limit_conf then
+ limit_count.rate_limit(limit_conf, ctx, plugin_name, used_tokens)
+ end
end
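
The schema hunk above replaces the unconditional required = {"limit", "time_window"} with a "dependencies" plus "anyOf" pair. The following is a minimal sketch of how the two keywords combine, assuming the api7/jsonschema library that APISIX generates its validators from; the schema is trimmed to the fields shown in the diff, not the plugin's full schema:

    local jsonschema = require("jsonschema")

    local schema = {
        type = "object",
        properties = {
            limit = { type = "integer" },
            time_window = { type = "integer" },
            instances = { type = "array", items = { type = "object" }, minItems = 1 },
        },
        -- limit and time_window must always come as a pair
        dependencies = {
            limit = { "time_window" },
            time_window = { "limit" },
        },
        -- and a config must carry either the pair or at least one instance
        anyOf = {
            { required = { "limit", "time_window" } },
            { required = { "instances" } },
        },
    }

    local validator = jsonschema.generate_validator(schema)

    print(validator({ limit = 30, time_window = 60 }))       -- true: the full pair
    print(validator({ instances = { { name = "gpt4" } } }))  -- true: instances alone
    print(validator({ limit = 30 }))                         -- false: a lone limit fails "dependencies"
    print(validator({}))                                     -- false: matches neither anyOf branch

The updated validation messages in the test expectations further below reflect these rules.
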
diff --git a/docs/en/latest/plugins/ai-rate-limiting.md b/docs/en/latest/plugins/ai-rate-limiting.md
index 839818153..d58eed528 100644
--- a/docs/en/latest/plugins/ai-rate-limiting.md
+++ b/docs/en/latest/plugins/ai-rate-limiting.md
@@ -35,17 +35,19 @@ The `ai-rate-limiting` plugin enforces token-based rate limiting for requests se
| Name                      | Type          | Required      | Description |
| ------------------------- | ------------- | ------------- | ----------- |
-| `limit`                   | integer       | false         | The maximum number of tokens allowed to consume within a given time interval. At least one of `limit` and `instances.limit` should be configured. |
-| `time_window`             | integer       | false         | The time interval corresponding to the rate limiting `limit` in seconds. At least one of `time_window` and `instances.time_window` should be configured. |
+| `limit`                   | integer       | conditionally | The maximum number of tokens allowed to consume within a given time interval. At least one of `limit` and `instances.limit` should be configured. |
+| `time_window`             | integer       | conditionally | The time interval corresponding to the rate limiting `limit` in seconds. At least one of `time_window` and `instances.time_window` should be configured. |
| `show_limit_quota_header` | boolean       | false         | If true, include `X-AI-RateLimit-Limit-*` to show the total quota, `X-AI-RateLimit-Remaining-*` to show the remaining quota in the response header, and `X-AI-RateLimit-Reset-*` to show the number of seconds left for the counter to reset, where `*` is the instance name. Default: `true` |
| `limit_strategy`          | string        | false         | Type of token to apply rate limiting. `total_tokens`, `prompt_tokens`, and `completion_tokens` values are returned in each model response, where `total_tokens` is the sum of `prompt_tokens` and `completion_tokens`. Default: `total_tokens` |
-| `instances`               | array[object] | false         | LLM instance rate limiting configurations. |
+| `instances`               | array[object] | conditionally | LLM instance rate limiting configurations. |
| `instances.name`          | string        | true          | Name of the LLM service instance. |
| `instances.limit`         | integer       | true          | The maximum number of tokens allowed to consume within a given time interval. |
| `instances.time_window`   | integer       | true          | The time interval corresponding to the rate limiting `limit` in seconds. |
| `rejected_code`           | integer       | false         | The HTTP status code returned when a request exceeding the quota is rejected. Default: `503` |
| `rejected_msg`            | string        | false         | The response body returned when a request exceeding the quota is rejected. |
+If `limit` is configured, `time_window` must also be configured, and vice versa. Alternatively, specifying only `instances` is sufficient.
+
## Example
Create a route as such and update with your LLM providers, models, API keys, and endpoints:
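
The runtime counterpart of that documentation note is the new conf.limit guard in fetch_limit_conf_kvs above: instances named under "instances" keep their own counters, while any other instance only falls back to a route-wide counter when "limit" is set. Below is a simplified sketch of that lookup with assumed names (the real plugin routes through transform_limit_conf and limit_count.rate_limit):

    -- build the per-instance limit table roughly the way the plugin's cache does
    local function make_limit_confs(conf)
        local confs = {}
        for _, ins in ipairs(conf.instances or {}) do
            -- instance-specific budgets are stored eagerly under their names
            confs[ins.name] = { count = ins.limit, time_window = ins.time_window }
        end
        return setmetatable(confs, {
            __index = function(t, name)
                if not conf.limit then
                    return nil  -- no route-wide default: unlisted instances are not limited
                end
                local c = { count = conf.limit, time_window = conf.time_window }
                t[name] = c     -- cache the lazily built default
                return c
            end,
        })
    end

    local confs = make_limit_confs({
        instances = { { name = "openai-gpt4", limit = 20, time_window = 60 } },
    })
    assert(confs["openai-gpt4"].count == 20)  -- listed instance keeps its budget
    assert(confs["openai-gpt3"] == nil)       -- unlisted instance bypasses limiting

TESTs 15 and 16 below assert exactly this behaviour: only openai-gpt4 is throttled, so once its 20-token budget is spent the traffic shifts to the unthrottled openai-gpt3.
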
diff --git a/t/plugin/ai-rate-limiting.t b/t/plugin/ai-rate-limiting.t
index c987cda2e..8ac6677e0 100644
--- a/t/plugin/ai-rate-limiting.t
+++ b/t/plugin/ai-rate-limiting.t
@@ -203,12 +203,42 @@ __DATA__
}
},
},
+ {
+ limit = 30,
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
+ },
+ {
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
+ },
{
limit = 30,
time_window = 60,
rejected_code = 403,
rejected_msg = "rate limit exceeded",
limit_strategy = "completion_tokens",
+ },
+ {
+ limit = 30,
+ time_window = 60,
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
}
}
local core = require("apisix.core")
@@ -225,12 +255,15 @@ __DATA__
}
}
--- response_body
-property "limit" is required
-property "time_window" is required
+property "limit" is required when "time_window" is set
+property "time_window" is required when "limit" is set
property "rejected_code" validation failed: expected 199 to be at least 200
property "limit_strategy" validation failed: matches none of the enum values
property "instances" validation failed: failed to validate item 2: property
"name" is required
-property "limit" is required
+property "limit" is required when "time_window" is set
+property "time_window" is required when "limit" is set
+passed
+passed
passed
done
@@ -682,3 +715,333 @@ passed
}
--- response_body
passed
+
+
+
+=== TEST 15: limiting to only one instance
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "limit": 20,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 16: 10 requests, 8 should be handled by gpt-3, 2 should be handled by gpt-4
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local core = require("apisix.core")
+
+ local instances_count = {}
+ for i = 1, 10 do
+ local code, _, body = t("/ai",
+ ngx.HTTP_POST,
+ [[{
+ "messages": [
+ { "role": "system", "content": "You are a
mathematician" },
+ { "role": "user", "content": "What is 1+1?" }
+ ]
+ }]],
+ nil,
+ {
+ ["test-type"] = "options",
+ ["Content-Type"] = "application/json",
+ }
+ )
+ assert(code == 200, "first request should be successful")
+ if core.string.find(body, "gpt-4") then
+ instances_count["gpt-4"] = (instances_count["gpt-4"] or 0)
+ 1
+ else
+ instances_count["gpt-3"] = (instances_count["gpt-3"] or 0)
+ 1
+ end
+ end
+
+ ngx.log(ngx.INFO, "instances_count test:", core.json.delay_encode(instances_count))
+
+ assert(instances_count["gpt-4"] <= 2, "gpt-4 should be handled by higher priority instance")
+ assert(instances_count["gpt-3"] >= 8, "gpt-3 should be handled by lower priority instance")
+ ngx.say("passed")
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 17: each instance uses a different rate limit
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "instances": [
+ {
+ "name": "openai-gpt3",
+ "limit": 50,
+ "time_window": 60
+ },
+ {
+ "name": "openai-gpt4",
+ "limit": 20,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 18: gpt3 allows 5 requests, gpt4 allows 2 requests
+--- pipelined_requests eval
+[
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+]
+--- more_headers
+Authorization: Bearer token
+--- error_code eval
+[200, 200, 200, 200, 200, 200, 200, 503, 503]
+
+
+
+=== TEST 19: set limit & instances
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "limit": 20,
+ "time_window": 60,
+ "instances": [
+ {
+ "name": "openai-gpt3",
+ "limit": 50,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 20: gpt3 allows 5 requests, gpt4 allows 2 requests
+--- pipelined_requests eval
+[
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+]
+--- more_headers
+Authorization: Bearer token
+--- error_code eval
+[200, 200, 200, 200, 200, 200, 200, 503, 503]
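
TESTs 19 and 20 combine both shapes: openai-gpt3 carries its own 50-token budget while openai-gpt4, absent from "instances", falls back to the route-wide limit = 20 / time_window = 60 pair. A sketch of that precedence, under the same simplified assumptions as the earlier snippets:

    -- conf mirrors TEST 19's ai-rate-limiting config
    local conf = {
        limit = 20, time_window = 60,  -- route-wide default, picked up by openai-gpt4
        instances = {
            { name = "openai-gpt3", limit = 50, time_window = 60 },
        },
    }

    -- assumed helper: an instance-specific budget wins, the route-wide pair is the fallback
    local function budget_for(conf, name)
        for _, ins in ipairs(conf.instances or {}) do
            if ins.name == name then
                return ins.limit, ins.time_window
            end
        end
        return conf.limit, conf.time_window  -- nil, nil when neither is configured
    end

    print(budget_for(conf, "openai-gpt3"))  --> 50  60 (own entry)
    print(budget_for(conf, "openai-gpt4"))  --> 20  60 (route-wide fallback)

If each mocked completion costs 10 tokens, gpt-3's budget covers 5 requests and gpt-4's covers 2, which matches the seven 200s followed by two 503s asserted above.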