This is an automated email from the ASF dual-hosted git repository.
shreemaanabhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new 7d8eb88ae fix(ai-rate-limiting): not allowed to limit to a single instance (#12061)
7d8eb88ae is described below
commit 7d8eb88ae91d0fb594781781cd13ecebf1bef191
Author: Shreemaan Abhishek <[email protected]>
AuthorDate: Tue Mar 18 12:20:13 2025 +0545
fix(ai-rate-limiting): not allowed to limit to a single instance (#12061)
* fix(ai-rate-limiting): not allowed to limit to a single instance
* add docs
* lint
---
apisix/plugins/ai-rate-limiting.lua | 31 ++-
docs/en/latest/plugins/ai-rate-limiting.md | 8 +-
t/plugin/ai-rate-limiting.t | 369 ++++++++++++++++++++++++++++-
3 files changed, 399 insertions(+), 9 deletions(-)
diff --git a/apisix/plugins/ai-rate-limiting.lua b/apisix/plugins/ai-rate-limiting.lua
index 631f1ab49..d8bf970fa 100644
--- a/apisix/plugins/ai-rate-limiting.lua
+++ b/apisix/plugins/ai-rate-limiting.lua
@@ -47,7 +47,8 @@ local schema = {
},
instances = {
type = "array",
- items = instance_limit_schema
+ items = instance_limit_schema,
+ minItems = 1,
},
rejected_code = {
type = "integer", minimum = 200, maximum = 599, default = 503
@@ -56,7 +57,18 @@ local schema = {
type = "string", minLength = 1
},
},
- required = {"limit", "time_window"},
+ dependencies = {
+ limit = {"time_window"},
+ time_window = {"limit"}
+ },
+ anyOf = {
+ {
+ required = {"limit", "time_window"}
+ },
+ {
+ required = {"instances"}
+ }
+ }
}
local _M = {
@@ -112,6 +124,10 @@ end
local function fetch_limit_conf_kvs(conf)
local mt = {
__index = function(t, k)
+ if not conf.limit then
+ return nil
+ end
+
local limit_conf = transform_limit_conf(conf, nil, k)
t[k] = limit_conf
return limit_conf
@@ -134,6 +150,9 @@ function _M.access(conf, ctx)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[ai_instance_name]
+ if not limit_conf then
+ return
+ end
local code, msg = limit_count.rate_limit(limit_conf, ctx, plugin_name, 1, true)
ctx.ai_rate_limiting = code and true or false
return code, msg
@@ -164,6 +183,10 @@ function _M.check_instance_status(conf, ctx, instance_name)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[instance_name]
+ if not limit_conf then
+ return true
+ end
+
local code, _ = limit_count.rate_limit(limit_conf, ctx, plugin_name, 1, true)
if code then
core.log.info("rate limit for instance: ", instance_name, " code: ", code)
@@ -202,7 +225,9 @@ function _M.log(conf, ctx)
local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
local limit_conf = limit_conf_kvs[instance_name]
- limit_count.rate_limit(limit_conf, ctx, plugin_name, used_tokens)
+ if limit_conf then
+ limit_count.rate_limit(limit_conf, ctx, plugin_name, used_tokens)
+ end
end
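
The schema hunk above replaces the unconditional required = {"limit", "time_window"} with a "dependencies" plus "anyOf" pair. The following is a minimal sketch of how the two keywords combine, assuming the api7/jsonschema library that APISIX generates its validators from; the schema is trimmed to the fields shown in the diff, not the plugin's full schema:

    local jsonschema = require("jsonschema")

    local schema = {
        type = "object",
        properties = {
            limit = { type = "integer" },
            time_window = { type = "integer" },
            instances = { type = "array", items = { type = "object" }, minItems = 1 },
        },
        -- limit and time_window must always come as a pair
        dependencies = {
            limit = { "time_window" },
            time_window = { "limit" },
        },
        -- and a config must carry either the pair or at least one instance
        anyOf = {
            { required = { "limit", "time_window" } },
            { required = { "instances" } },
        },
    }

    local validator = jsonschema.generate_validator(schema)

    print(validator({ limit = 30, time_window = 60 }))       -- true: the full pair
    print(validator({ instances = { { name = "gpt4" } } }))  -- true: instances alone
    print(validator({ limit = 30 }))                         -- false: a lone limit fails "dependencies"
    print(validator({}))                                     -- false: matches neither anyOf branch

The updated validation messages in the test expectations further below reflect these rules.
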
diff --git a/docs/en/latest/plugins/ai-rate-limiting.md b/docs/en/latest/plugins/ai-rate-limiting.md
index 839818153..d58eed528 100644
--- a/docs/en/latest/plugins/ai-rate-limiting.md
+++ b/docs/en/latest/plugins/ai-rate-limiting.md
@@ -35,17 +35,19 @@ The `ai-rate-limiting` plugin enforces token-based rate limiting for requests se
| Name                      | Type          | Required      | Description |
| ------------------------- | ------------- | ------------- | ----------- |
-| `limit`                   | integer       | false         | The maximum number of tokens allowed to consume within a given time interval. At least one of `limit` and `instances.limit` should be configured. |
-| `time_window`             | integer       | false         | The time interval corresponding to the rate limiting `limit` in seconds. At least one of `time_window` and `instances.time_window` should be configured. |
+| `limit`                   | integer       | conditionally | The maximum number of tokens allowed to consume within a given time interval. At least one of `limit` and `instances.limit` should be configured. |
+| `time_window`             | integer       | conditionally | The time interval corresponding to the rate limiting `limit` in seconds. At least one of `time_window` and `instances.time_window` should be configured. |
| `show_limit_quota_header` | boolean       | false         | If true, include `X-AI-RateLimit-Limit-*` to show the total quota, `X-AI-RateLimit-Remaining-*` to show the remaining quota in the response header, and `X-AI-RateLimit-Reset-*` to show the number of seconds left for the counter to reset, where `*` is the instance name. Default: `true` |
| `limit_strategy`          | string        | false         | Type of token to apply rate limiting. `total_tokens`, `prompt_tokens`, and `completion_tokens` values are returned in each model response, where `total_tokens` is the sum of `prompt_tokens` and `completion_tokens`. Default: `total_tokens` |
-| `instances`               | array[object] | false         | LLM instance rate limiting configurations. |
+| `instances`               | array[object] | conditionally | LLM instance rate limiting configurations. |
| `instances.name`          | string        | true          | Name of the LLM service instance. |
| `instances.limit`         | integer       | true          | The maximum number of tokens allowed to consume within a given time interval. |
| `instances.time_window`   | integer       | true          | The time interval corresponding to the rate limiting `limit` in seconds. |
| `rejected_code`           | integer       | false         | The HTTP status code returned when a request exceeding the quota is rejected. Default: `503` |
| `rejected_msg`            | string        | false         | The response body returned when a request exceeding the quota is rejected. |
+If `limit` is configured, `time_window` must also be configured, and vice versa. Alternatively, specifying only `instances` is sufficient.
+
## Example
Create a route as such and update with your LLM providers, models, API keys, and endpoints:
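
The runtime counterpart of that documentation note is the new conf.limit guard in fetch_limit_conf_kvs above: instances named under "instances" keep their own counters, while any other instance only falls back to a route-wide counter when "limit" is set. Below is a simplified sketch of that lookup with assumed names (the real plugin routes through transform_limit_conf and limit_count.rate_limit):

    -- build the per-instance limit table roughly the way the plugin's cache does
    local function make_limit_confs(conf)
        local confs = {}
        for _, ins in ipairs(conf.instances or {}) do
            -- instance-specific budgets are stored eagerly under their names
            confs[ins.name] = { count = ins.limit, time_window = ins.time_window }
        end
        return setmetatable(confs, {
            __index = function(t, name)
                if not conf.limit then
                    return nil  -- no route-wide default: unlisted instances are not limited
                end
                local c = { count = conf.limit, time_window = conf.time_window }
                t[name] = c     -- cache the lazily built default
                return c
            end,
        })
    end

    local confs = make_limit_confs({
        instances = { { name = "openai-gpt4", limit = 20, time_window = 60 } },
    })
    assert(confs["openai-gpt4"].count == 20)  -- listed instance keeps its budget
    assert(confs["openai-gpt3"] == nil)       -- unlisted instance bypasses limiting

TESTs 15 and 16 below assert exactly this behaviour: only openai-gpt4 is throttled, so once its 20-token budget is spent the traffic shifts to the unthrottled openai-gpt3.
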
diff --git a/t/plugin/ai-rate-limiting.t b/t/plugin/ai-rate-limiting.t
index c987cda2e..8ac6677e0 100644
--- a/t/plugin/ai-rate-limiting.t
+++ b/t/plugin/ai-rate-limiting.t
@@ -203,12 +203,42 @@ __DATA__
}
},
},
+ {
+ limit = 30,
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
+ },
+ {
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
+ },
{
limit = 30,
time_window = 60,
rejected_code = 403,
rejected_msg = "rate limit exceeded",
limit_strategy = "completion_tokens",
+ },
+ {
+ limit = 30,
+ time_window = 60,
+ instances = {
+ {
+ name = "instance1",
+ limit = 30,
+ time_window = 60,
+ }
+ },
}
}
local core = require("apisix.core")
@@ -225,12 +255,15 @@ __DATA__
}
}
--- response_body
-property "limit" is required
-property "time_window" is required
+property "limit" is required when "time_window" is set
+property "time_window" is required when "limit" is set
property "rejected_code" validation failed: expected 199 to be at least 200
property "limit_strategy" validation failed: matches none of the enum values
property "instances" validation failed: failed to validate item 2: property
"name" is required
-property "limit" is required
+property "limit" is required when "time_window" is set
+property "time_window" is required when "limit" is set
+passed
+passed
passed
done
@@ -682,3 +715,333 @@ passed
}
--- response_body
passed
+
+
+
+=== TEST 15: limiting to only one instance
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "limit": 20,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 16: 10 requests, 8 should be handled by gpt-3, 2 should be handled by gpt-4
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local core = require("apisix.core")
+
+ local instances_count = {}
+ for i = 1, 10 do
+ local code, _, body = t("/ai",
+ ngx.HTTP_POST,
+ [[{
+ "messages": [
+ { "role": "system", "content": "You are a
mathematician" },
+ { "role": "user", "content": "What is 1+1?" }
+ ]
+ }]],
+ nil,
+ {
+ ["test-type"] = "options",
+ ["Content-Type"] = "application/json",
+ }
+ )
+ assert(code == 200, "first request should be successful")
+ if core.string.find(body, "gpt-4") then
+ instances_count["gpt-4"] = (instances_count["gpt-4"] or 0)
+ 1
+ else
+ instances_count["gpt-3"] = (instances_count["gpt-3"] or 0)
+ 1
+ end
+ end
+
+ ngx.log(ngx.INFO, "instances_count test:", core.json.delay_encode(instances_count))
+
+ assert(instances_count["gpt-4"] <= 2, "gpt-4 should be handled by higher priority instance")
+ assert(instances_count["gpt-3"] >= 8, "gpt-3 should be handled by lower priority instance")
+ ngx.say("passed")
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 17: each instance uses a different rate limit
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "instances": [
+ {
+ "name": "openai-gpt3",
+ "limit": 50,
+ "time_window": 60
+ },
+ {
+ "name": "openai-gpt4",
+ "limit": 20,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 18: gpt3 allows 5 requests, gpt4 allows 2 requests
+--- pipelined_requests eval
+[
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+]
+--- more_headers
+Authorization: Bearer token
+--- error_code eval
+[200, 200, 200, 200, 200, 200, 200, 503, 503]
+
+
+
+=== TEST 19: set limit & instances
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/ai",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy":
"instance_health_and_rate_limiting",
+ "instances": [
+ {
+ "name": "openai-gpt4",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 1,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ },
+ {
+ "name": "openai-gpt3",
+ "provider": "openai",
+ "weight": 1,
+ "priority": 0,
+ "auth": {
+ "header": {
+ "Authorization": "Bearer token"
+ }
+ },
+ "options": {
+ "model": "gpt-3"
+ },
+ "override": {
+ "endpoint": "http://localhost:16724"
+ }
+ }
+ ],
+ "ssl_verify": false
+ },
+ "ai-rate-limiting": {
+ "limit": 20,
+ "time_window": 60,
+ "instances": [
+ {
+ "name": "openai-gpt3",
+ "limit": 50,
+ "time_window": 60
+ }
+ ]
+ }
+ },
+ "upstream": {
+ "type": "roundrobin",
+ "nodes": {
+ "canbeanything.com": 1
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 20: gpt3 allows 5 requests, gpt4 allows 2 requests
+--- pipelined_requests eval
+[
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+ "POST /ai\n" . "{ \"messages\": [ { \"role\": \"system\", \"content\": \"You are a mathematician\" }, { \"role\": \"user\", \"content\": \"What is 1+1?\"} ] }",
+]
+--- more_headers
+Authorization: Bearer token
+--- error_code eval
+[200, 200, 200, 200, 200, 200, 200, 503, 503]
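
TESTs 19 and 20 combine both shapes: openai-gpt3 carries its own 50-token budget while openai-gpt4, absent from "instances", falls back to the route-wide limit = 20 / time_window = 60 pair. A sketch of that precedence, under the same simplified assumptions as the earlier snippets:

    -- conf mirrors TEST 19's ai-rate-limiting config
    local conf = {
        limit = 20, time_window = 60,  -- route-wide default, picked up by openai-gpt4
        instances = {
            { name = "openai-gpt3", limit = 50, time_window = 60 },
        },
    }

    -- assumed helper: an instance-specific budget wins, the route-wide pair is the fallback
    local function budget_for(conf, name)
        for _, ins in ipairs(conf.instances or {}) do
            if ins.name == name then
                return ins.limit, ins.time_window
            end
        end
        return conf.limit, conf.time_window  -- nil, nil when neither is configured
    end

    print(budget_for(conf, "openai-gpt3"))  --> 50  60 (own entry)
    print(budget_for(conf, "openai-gpt4"))  --> 20  60 (route-wide fallback)

If each mocked completion costs 10 tokens, gpt-3's budget covers 5 requests and gpt-4's covers 2, which matches the seven 200s followed by two 503s asserted above.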