Copilot commented on code in PR #12991: URL: https://github.com/apache/apisix/pull/12991#discussion_r2802513724
########## t/node/warm-up.t: ########## @@ -0,0 +1,663 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +repeat_each(1); +log_level('warn'); +no_root_location(); +no_shuffle(); + +run_tests(); + +__DATA__ + +=== TEST 1: create upstream with single node (starts warm-up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + -- 1. Create Upstream + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100, "update_time": ]] .. ngx.time() - 10 .. [[} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say("upstream failed: ", body) + return + end + + -- 2. 
Create Route using Upstream + code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/server_port", + "upstream_id": "1" + }]] + ) + + if code >= 300 then + ngx.status = code + ngx.say("route failed: ", body) + return + end + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 2: add new node (auto-detects as new and warms up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). + -- The new node (1981) should get a new update_time (so it starts warming up). + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 3: verify warm-up traffic skew (Node 1980 >> Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. 
"/server_port" + + local ports_count = {} + -- Node 1980: fully warmed (weight 100) + -- Node 1981: just started (weight ~1) + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Warm-up check: 1980=", count_80, ", 1981=", count_81) + + -- Expect heavy skew to 1980 + if count_80 >= 7 and count_81 <= 3 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 4: wait for warm-up to complete +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local upstream = require("apisix.upstream").get_by_id(1) + + if upstream and upstream.nodes then + local max_update_time = 0 + for _, node in ipairs(upstream.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + + if max_update_time > 0 then + local now = ngx.time() + local warm_up_duration = upstream.warm_up_conf.slow_start_time_seconds + local elapsed = now - max_update_time + + if elapsed < warm_up_duration then + ngx.sleep(warm_up_duration - elapsed) + end + end + else + ngx.say("cannot find upstream id: 1") + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 5: verify balanced traffic after warm-up +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. 
"/server_port" + + local ports_count = {} + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Balanced check: 1980=", count_80, ", 1981=", count_81) + + -- Expect balanced traffic + if count_80 == count_81 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 6: add domain node +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). Review Comment: The comment states "The existing node (1980) should preserve its update_time (so it stays fully warmed)" but this is incorrect. Changing the host from "127.0.0.1" to "localhost" for port 1980 changes the node identification key from "127.0.0.1:1980" to "localhost:1980". According to the node identification logic in apisix/admin/upstreams.lua lines 40 and 47, nodes are identified by "(node.domain or node.host) .. ':' .. tostring(node.port)". Since the key changed, this node will NOT preserve its update_time and will instead be treated as a new node and assigned a new update_time, triggering warm-up. The comment should be corrected to reflect that this node will be treated as new and will start warming up. ```suggestion -- The node on port 1980, with host changed to "localhost", will be treated as a new node and get a new update_time (so it starts warming up). 
``` ########## apisix/utils/upstream.lua: ########## @@ -26,7 +26,10 @@ local ngx = ngx local _M = {} -local function sort_by_key_host(a, b) +local function sort_by_key_host_port(a, b) + if a.host == b.host then + return a.port < b.port + end return a.host < b.host end Review Comment: The sort function `sort_by_key_host_port` only sorts by `host` and `port`, but doesn't consider the `domain` field. When DNS resolution occurs, nodes get a `domain` field set to the original hostname while `host` becomes the resolved IP address. If two nodes have the same resolved IP and port but different domain names, they would be considered identical by this sort function, but should actually be treated as different nodes. This could cause incorrect node comparisons and affect warm-up timestamp preservation. Consider including the `domain` field in the sort comparison, or ensure that the `domain` field is consistently handled when comparing nodes. ########## apisix/balancer.lua: ########## @@ -178,6 +197,35 @@ local function set_balancer_opts(route, ctx) end +local function get_version_with_warm_up(version, up_conf) + if not up_conf.warm_up_conf or up_conf.warm_up_conf.finish then + return version + end + local warm_up_conf = up_conf.warm_up_conf + local warm_up_end_time = warm_up_conf.warm_up_end_time + if not warm_up_end_time then + local max_update_time = 0 + for _, node in ipairs(up_conf.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + if max_update_time > 0 then + warm_up_end_time = max_update_time + warm_up_conf.slow_start_time_seconds + warm_up_conf.warm_up_end_time = warm_up_end_time + end + end + + if warm_up_end_time and ngx_time() < warm_up_end_time then + version = version .. 
math_floor(ngx_time() / warm_up_conf.interval) + else + warm_up_conf.finish = true + end Review Comment: The code mutates the `warm_up_conf` table by setting `warm_up_conf.warm_up_end_time` (line 215) and `warm_up_conf.finish` (line 222). Since `warm_up_conf` is part of the upstream configuration which may be shared across multiple requests and workers, this mutation could cause race conditions or unexpected behavior. Consider using a separate cache or data structure to track warm-up state that doesn't modify the original configuration object, or ensure proper synchronization if the configuration is shared. ```suggestion -- store warm-up runtime state separately from the upstream configuration local upstream_warmup_state = setmetatable({}, { __mode = "k" }) local function get_version_with_warm_up(version, up_conf) local warm_up_conf = up_conf.warm_up_conf if not warm_up_conf then return version end -- use a separate state table so we don't mutate the configuration object local state = upstream_warmup_state[warm_up_conf] if not state then state = {} upstream_warmup_state[warm_up_conf] = state end if state.finish then return version end local warm_up_end_time = state.warm_up_end_time if not warm_up_end_time then local max_update_time = 0 for _, node in ipairs(up_conf.nodes) do if node.update_time and node.update_time > max_update_time then max_update_time = node.update_time end end if max_update_time > 0 then warm_up_end_time = max_update_time + warm_up_conf.slow_start_time_seconds state.warm_up_end_time = warm_up_end_time end end if warm_up_end_time and ngx_time() < warm_up_end_time then version = version .. math_floor(ngx_time() / warm_up_conf.interval) else state.finish = true end ``` ########## t/node/warm-up.t: ########## @@ -0,0 +1,663 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +repeat_each(1); +log_level('warn'); +no_root_location(); +no_shuffle(); + +run_tests(); + +__DATA__ + +=== TEST 1: create upstream with single node (starts warm-up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + -- 1. Create Upstream + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100, "update_time": ]] .. ngx.time() - 10 .. [[} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say("upstream failed: ", body) + return + end + + -- 2. Create Route using Upstream + code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/server_port", + "upstream_id": "1" + }]] + ) + + if code >= 300 then + ngx.status = code + ngx.say("route failed: ", body) + return + end + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 2: add new node (auto-detects as new and warms up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). 
+ -- The new node (1981) should get a new update_time (so it starts warming up). + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 3: verify warm-up traffic skew (Node 1980 >> Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + -- Node 1980: fully warmed (weight 100) + -- Node 1981: just started (weight ~1) + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Warm-up check: 1980=", count_80, ", 1981=", count_81) + + -- Expect heavy skew to 1980 + if count_80 >= 7 and count_81 <= 3 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. 
count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 4: wait for warm-up to complete +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local upstream = require("apisix.upstream").get_by_id(1) + + if upstream and upstream.nodes then + local max_update_time = 0 + for _, node in ipairs(upstream.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + + if max_update_time > 0 then + local now = ngx.time() + local warm_up_duration = upstream.warm_up_conf.slow_start_time_seconds + local elapsed = now - max_update_time + + if elapsed < warm_up_duration then + ngx.sleep(warm_up_duration - elapsed) + end + end + else + ngx.say("cannot find upstream id: 1") + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 5: verify balanced traffic after warm-up +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Balanced check: 1980=", count_80, ", 1981=", count_81) + + -- Expect balanced traffic + if count_80 == count_81 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 6: add domain node +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). 
+ -- The new node (1981) should get a new update_time (so it starts warming up). + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "localhost", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 7: verify warm-up traffic skew for domain(Node 1980 << Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + -- Node 1980: fully warmed (weight 100) + -- Node 1981: just started (weight ~1) Review Comment: The test comments are misleading. On lines 288-289, the comments state "Node 1980: fully warmed (weight 100)" and "Node 1981: just started (weight ~1)", but this is incorrect. In TEST 6, the host for port 1980 was changed from "127.0.0.1" to "localhost", which changes the node identification key from "127.0.0.1:1980" to "localhost:1980". This causes the node to be treated as new and triggers warm-up. Meanwhile, port 1981 (127.0.0.1:1981) was added in TEST 2 and should be fully warmed by the time TEST 7 runs (after TEST 4 waits for warm-up completion). The comments should be corrected to: "Node 1980 (localhost): just started (weight ~1)" and "Node 1981: fully warmed (weight 100)" to accurately reflect the test behavior. The expectation on line 307 is correct, but the comments are backwards. 
########## docs/zh/latest/admin-api.md: ########## @@ -1031,6 +1031,10 @@ APISIX 的 Upstream 除了基本的负载均衡算法选择外,还支持对上 |keepalive_pool.size | 否 | 辅助 | 动态设置 `keepalive` 指令,详细信息请参考下文。 | |keepalive_pool.idle_timeout | 否 | 辅助 | 动态设置 `keepalive_timeout` 指令,详细信息请参考下文。 | |keepalive_pool.requests | 否 | 辅助 | 动态设置 `keepalive_requests` 指令,详细信息请参考下文。 | +|warm_up_conf.slow_start_time_seconds | 是 | integer | 预热持续时间(秒)。 | 30 | +|warm_up_conf.min_weight_percent | 是 | integer | 预热开始时的最小权重百分比(1-100)。 | 1 | +|warm_up_conf.aggression | 否 | number | 控制预热期间流量增长的速率,默认为 1。 | 1.0 | +|warm_up_conf.interval | 否 | integer | 权重更新间隔(秒)。 | 1 | Review Comment: The documentation table shows "是" (Yes in Chinese) in the "Required" column for warm_up_conf fields, but according to the schema definition in apisix/schema_def.lua line 444, only "slow_start_time_seconds" and "min_weight_percent" are required fields. The "aggression" and "interval" fields are optional with default values. The table should indicate "否" (No) for aggression and interval in the Required column, or the column header should be changed to clarify what "是" means in this context (e.g., if it means "has default value" rather than "required"). ########## apisix/balancer.lua: ########## @@ -55,7 +57,24 @@ local function transform_node(new_nodes, node) core.table.insert(new_nodes._priority_index, node.priority) end - new_nodes[node.priority][node.host .. ":" .. node.port] = node.weight + local weight = node.weight + if warm_up_conf then + local start_time = node.update_time + if start_time then + local time_since_start_seconds = math_max(ngx_time() - start_time, 1) Review Comment: The warm-up weight calculation on line 64 uses `math_max(ngx_time() - start_time, 1)` to ensure a minimum value of 1. However, if `ngx_time() < start_time` (due to clock skew), this will still result in a negative value being passed through. While clock skew protection exists in apisix/upstream.lua line 490-492, it only caps future timestamps at the current time. 
If clock skew causes ngx_time() to go backwards after a node's update_time was set, this calculation could produce unexpected behavior. Consider using `math_max(ngx_time() - start_time, 0)` and handling the case where elapsed time is 0 explicitly, or add additional safeguards. ```suggestion local time_since_start_seconds = math_max(ngx_time() - start_time, 0) ``` ########## apisix/admin/upstreams.lua: ########## @@ -26,6 +26,52 @@ local apisix_upstream = require("apisix.upstream") local resource = require("apisix.admin.resource") local tostring = tostring local ipairs = ipairs +local ngx_time = ngx.time + + +local function update_warm_up_timestamps(conf, old_conf) + if not conf or not conf.nodes or not core.table.isarray(conf.nodes) then + return + end + + local previous_node_update_time = {} + if old_conf and old_conf.nodes and core.table.isarray(old_conf.nodes) then + for _, node in ipairs(old_conf.nodes) do + local key = (node.domain or node.host) .. ":" .. tostring(node.port) + previous_node_update_time[key] = node.update_time + end + end + + for _, node in ipairs(conf.nodes) do + if not node.update_time then + local key = (node.domain or node.host) .. ":" .. tostring(node.port) + node.update_time = previous_node_update_time[key] or ngx_time() Review Comment: Clock skew protection is implemented in apisix/upstream.lua lines 488-492 to cap future timestamps at the current time. However, this protection only applies when loading configurations from etcd via the `filter_upstream` function. The `update_time` fields are also set in the admin API via `initialize_conf` in apisix/admin/upstreams.lua line 48, which uses `ngx_time()` without any clock skew protection. If the admin API and data plane workers have clock skew between them, a node could be assigned an `update_time` in the future relative to the worker's clock, and this would only be corrected when the config is loaded from etcd. 
Consider adding similar clock skew protection in the `initialize_conf` function to ensure consistency. ```suggestion local now = ngx_time() for _, node in ipairs(conf.nodes) do if not node.update_time then local key = (node.domain or node.host) .. ":" .. tostring(node.port) local prev = previous_node_update_time[key] if prev and prev > now then -- clamp future timestamps to the current time to protect against clock skew node.update_time = now else node.update_time = prev or now end ``` ########## docs/en/latest/admin-api.md: ########## @@ -1050,6 +1054,25 @@ Or you can reference SSL object by `tls.client_cert_id` to set SSL cert and key. To allow Upstream to have a separate connection pool, use `keepalive_pool`. It can be configured by modifying its child fields. +Here is the polished English translation of your text: Review Comment: The line "Here is the polished English translation of your text:" appears to be editorial text that was accidentally left in the documentation. This should be removed as it doesn't belong in the final documentation. ########## apisix/balancer.lua: ########## @@ -55,7 +57,24 @@ local function transform_node(new_nodes, node) core.table.insert(new_nodes._priority_index, node.priority) end - new_nodes[node.priority][node.host .. ":" .. node.port] = node.weight + local weight = node.weight + if warm_up_conf then + local start_time = node.update_time + if start_time then + local time_since_start_seconds = math_max(ngx_time() - start_time, 1) + if time_since_start_seconds < warm_up_conf.slow_start_time_seconds then + local time_factor = time_since_start_seconds / warm_up_conf.slow_start_time_seconds + local ramped_weight_ratio = math_max(warm_up_conf.min_weight_percent / 100, + time_factor ^ (1 / warm_up_conf.aggression)) Review Comment: The warm-up weight calculation uses `time_factor ^ (1 / warm_up_conf.aggression)` where aggression comes from user configuration. 
According to the schema in apisix/schema_def.lua line 433, aggression has `exclusiveMinimum: 0`, which means it can be arbitrarily close to 0 (e.g., 0.0001). When aggression approaches 0, `1 / warm_up_conf.aggression` becomes very large, and `time_factor ^ (very_large_number)` where time_factor is between 0 and 1 will approach 0 rapidly, potentially causing numerical instability or unexpected weight calculations. Consider adding a reasonable minimum bound for aggression (e.g., minimum: 0.01) or adding validation logic to prevent extremely small values. ```suggestion -- Clamp aggression to avoid numerical instability when it is very close to zero local aggression = math_max(warm_up_conf.aggression or 1, 0.01) local ramped_weight_ratio = math_max(warm_up_conf.min_weight_percent / 100, time_factor ^ (1 / aggression)) ``` ########## t/node/warm-up.t: ########## @@ -0,0 +1,663 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +repeat_each(1); +log_level('warn'); +no_root_location(); +no_shuffle(); + +run_tests(); + +__DATA__ + +=== TEST 1: create upstream with single node (starts warm-up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + -- 1. 
Create Upstream + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100, "update_time": ]] .. ngx.time() - 10 .. [[} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say("upstream failed: ", body) + return + end + + -- 2. Create Route using Upstream + code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/server_port", + "upstream_id": "1" + }]] + ) + + if code >= 300 then + ngx.status = code + ngx.say("route failed: ", body) + return + end + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 2: add new node (auto-detects as new and warms up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). + -- The new node (1981) should get a new update_time (so it starts warming up). + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 3: verify warm-up traffic skew (Node 1980 >> Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. 
"/server_port" + + local ports_count = {} + -- Node 1980: fully warmed (weight 100) + -- Node 1981: just started (weight ~1) + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Warm-up check: 1980=", count_80, ", 1981=", count_81) + + -- Expect heavy skew to 1980 + if count_80 >= 7 and count_81 <= 3 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 4: wait for warm-up to complete +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local upstream = require("apisix.upstream").get_by_id(1) + + if upstream and upstream.nodes then + local max_update_time = 0 + for _, node in ipairs(upstream.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + + if max_update_time > 0 then + local now = ngx.time() + local warm_up_duration = upstream.warm_up_conf.slow_start_time_seconds + local elapsed = now - max_update_time + + if elapsed < warm_up_duration then + ngx.sleep(warm_up_duration - elapsed) + end + end + else + ngx.say("cannot find upstream id: 1") + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 5: verify balanced traffic after warm-up +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. 
"/server_port" + + local ports_count = {} + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Balanced check: 1980=", count_80, ", 1981=", count_81) + + -- Expect balanced traffic + if count_80 == count_81 then Review Comment: The test uses a strict equality check (count_80 == count_81) to verify balanced traffic after warm-up. With only 10 requests, there's a significant probability of natural variance causing the test to fail even when the load balancing is working correctly. For roundrobin with equal weights, a 6-4 or 7-3 split is statistically possible and doesn't indicate a bug. Consider using a more lenient assertion (e.g., |count_80 - count_81| <= 2) or increasing the number of requests to reduce variance and make the test more reliable. ```suggestion -- Expect approximately balanced traffic (allow small variance) if math.abs(count_80 - count_81) <= 2 then ``` ########## t/node/warm-up.t: ########## @@ -0,0 +1,663 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +repeat_each(1); +log_level('warn'); +no_root_location(); +no_shuffle(); + +run_tests(); + +__DATA__ + +=== TEST 1: create upstream with single node (starts warm-up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + -- 1. Create Upstream + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100, "update_time": ]] .. ngx.time() - 10 .. [[} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say("upstream failed: ", body) + return + end + + -- 2. Create Route using Upstream + code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/server_port", + "upstream_id": "1" + }]] + ) + + if code >= 300 then + ngx.status = code + ngx.say("route failed: ", body) + return + end + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 2: add new node (auto-detects as new and warms up) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). + -- The new node (1981) should get a new update_time (so it starts warming up). 
+ local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "127.0.0.1", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 3: verify warm-up traffic skew (Node 1980 >> Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + -- Node 1980: fully warmed (weight 100) + -- Node 1981: just started (weight ~1) + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Warm-up check: 1980=", count_80, ", 1981=", count_81) + + -- Expect heavy skew to 1980 + if count_80 >= 7 and count_81 <= 3 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. 
count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 4: wait for warm-up to complete +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local upstream = require("apisix.upstream").get_by_id(1) + + if upstream and upstream.nodes then + local max_update_time = 0 + for _, node in ipairs(upstream.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + + if max_update_time > 0 then + local now = ngx.time() + local warm_up_duration = upstream.warm_up_conf.slow_start_time_seconds + local elapsed = now - max_update_time + + if elapsed < warm_up_duration then + ngx.sleep(warm_up_duration - elapsed) + end + end + else + ngx.say("cannot find upstream id: 1") + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 5: verify balanced traffic after warm-up +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Balanced check: 1980=", count_80, ", 1981=", count_81) + + -- Expect balanced traffic + if count_80 == count_81 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" .. count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 6: add domain node +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + -- We update the upstream with a new node. + -- The existing node (1980) should preserve its update_time (so it stays fully warmed). 
+ -- The new node (1981) should get a new update_time (so it starts warming up). + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "localhost", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 7: verify warm-up traffic skew for domain (Node 1980 << Node 1981) +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port" + + local ports_count = {} + -- Node 1980: just started (weight ~1) + -- Node 1981: fully warmed (weight 100) + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Warm-up check: 1980=", count_80, ", 1981=", count_81) + + -- Expect heavy skew to 1981 + if count_80 <= 3 and count_81 >= 7 then + ngx.say("passed") + else + ngx.say("failed: 1980=" .. count_80 .. ", 1981=" ..
count_81) + end + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 8: wait for warm-up to complete for domain +--- timeout: 10 +--- config + location /t { + content_by_lua_block { + local upstream = require("apisix.upstream").get_by_id(1) + + if upstream and upstream.nodes then + local max_update_time = 0 + for _, node in ipairs(upstream.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + + if max_update_time > 0 then + local now = ngx.time() + local warm_up_duration = upstream.warm_up_conf.slow_start_time_seconds + local elapsed = now - max_update_time + + if elapsed < warm_up_duration then + ngx.sleep(warm_up_duration - elapsed) + end + end + else + ngx.say("cannot find upstream id: 1") + end + + ngx.say("passed") + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 9: update upstream again, update_time should remain unchanged +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/upstreams/1', + ngx.HTTP_PUT, + [[{ + "id": "1", + "type": "roundrobin", + "nodes": [ + {"host": "localhost", "port": 1980, "weight": 100}, + {"host": "127.0.0.1", "port": 1981, "weight": 100} + ], + "warm_up_conf": { + "slow_start_time_seconds": 5, + "min_weight_percent": 1, + "aggression": 1.0 + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed + + + +=== TEST 10: verify balanced traffic after warm-up for domain +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. 
"/server_port" + + local ports_count = {} + + for i = 1, 10 do + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + ports_count[res.body] = (ports_count[res.body] or 0) + 1 + end + + local count_80 = ports_count["1980"] or 0 + local count_81 = ports_count["1981"] or 0 + + ngx.log(ngx.INFO, "Balanced check: 1980=", count_80, ", 1981=", count_81) + + -- Expect balanced traffic + if count_80 == count_81 then Review Comment: The test uses a strict equality check (count_80 == count_81) to verify balanced traffic after warm-up. With only 10 requests, there's a significant probability of natural variance causing the test to fail even when the load balancing is working correctly. For roundrobin with equal weights, a 6-4 or 7-3 split is statistically possible and doesn't indicate a bug. Consider using a more lenient assertion (e.g., |count_80 - count_81| <= 2) or increasing the number of requests to reduce variance and make the test more reliable. ########## apisix/balancer.lua: ########## @@ -178,6 +197,35 @@ local function set_balancer_opts(route, ctx) end +local function get_version_with_warm_up(version, up_conf) + if not up_conf.warm_up_conf or up_conf.warm_up_conf.finish then + return version + end + local warm_up_conf = up_conf.warm_up_conf + local warm_up_end_time = warm_up_conf.warm_up_end_time + if not warm_up_end_time then + local max_update_time = 0 + for _, node in ipairs(up_conf.nodes) do + if node.update_time and node.update_time > max_update_time then + max_update_time = node.update_time + end + end + if max_update_time > 0 then + warm_up_end_time = max_update_time + warm_up_conf.slow_start_time_seconds + warm_up_conf.warm_up_end_time = warm_up_end_time + end + end + + if warm_up_end_time and ngx_time() < warm_up_end_time then + version = version .. 
math_floor(ngx_time() / warm_up_conf.interval) Review Comment: The version calculation uses string concatenation with `math_floor(ngx_time() / warm_up_conf.interval)`. The default interval is 1 second according to the schema. However, if the interval is set to a very large value (e.g., 3600 seconds), the version would only update once per hour, meaning weight calculations would be cached for an hour even though weights should be recalculated based on elapsed time. This could cause nodes to remain at reduced weights longer than intended during the warm-up period. Consider adding documentation or validation to ensure interval values are appropriate for the warm-up duration, or ensure the cache invalidation logic accounts for this scenario. ```suggestion local interval = warm_up_conf.interval or 1 if interval <= 0 then interval = 1 end if warm_up_conf.slow_start_time_seconds and interval > warm_up_conf.slow_start_time_seconds then interval = warm_up_conf.slow_start_time_seconds end version = version .. math_floor(ngx_time() / interval) ``` ########## apisix/admin/resource.lua: ########## @@ -125,6 +125,10 @@ function _M:check_conf(id, conf, need_id, typ, allow_time) local conf_for_check = tbl_deepcopy(conf) local ok, err = self.checker(id, conf_for_check, need_id, self.schema, {secret_type = typ}) + if self.initialize_conf then + self.initialize_conf(id, conf) + end Review Comment: The `initialize_conf` hook is called after schema validation but before `encrypt_conf`. This ordering means that `initialize_conf` modifies the configuration by adding `update_time` fields to nodes, but this modification happens after the schema check has already passed. If the schema validation made a deep copy and validated that, the actual `conf` being stored might have different fields than what was validated. 
While this appears intentional for this feature, it's worth verifying that modifying `conf` after validation doesn't violate any assumptions in the etcd storage layer or cause issues with configuration comparison logic elsewhere in the codebase. ########## docs/en/latest/admin-api.md: ########## @@ -1023,6 +1023,10 @@ In addition to the equalization algorithm selections, Upstream also supports pas | keepalive_pool.size | False | Auxiliary | Sets `keepalive` directive dynamically. | | | keepalive_pool.idle_timeout | False | Auxiliary | Sets `keepalive_timeout` directive dynamically. | | | keepalive_pool.requests | False | Auxiliary | Sets `keepalive_requests` directive dynamically. | | +| warm_up_conf.slow_start_time_seconds | True | Integer | Slow start window in seconds. | 30 | +| warm_up_conf.min_weight_percent | True | Integer | Minimum weight percentage at the start of warm-up (1-100). | 1 | +| warm_up_conf.aggression | False | Number | Controls the rate of traffic increase during slow start, default is 1. | 1.0 | +| warm_up_conf.interval | False | Integer | Weight update interval in seconds. | 1 | Review Comment: The documentation table shows "True" in what appears to be the "Required" column for all warm_up_conf fields, but according to the schema definition in apisix/schema_def.lua line 444, only "slow_start_time_seconds" and "min_weight_percent" are required fields. The "aggression" and "interval" fields are optional with default values. The table should indicate "False" for aggression and interval in the Required column to accurately reflect the schema. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
