This is an automated email from the ASF dual-hosted git repository.
traeak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 0c052b124a nexthop health status don't overflow fail count (#13164)
0c052b124a is described below
commit 0c052b124a3b4fb26967a07815cca3fef7bd0495
Author: Brian Olsen <[email protected]>
AuthorDate: Thu May 14 14:09:46 2026 -0600
nexthop health status don't overflow fail count (#13164)
---
src/proxy/http/remap/NextHopHealthStatus.cc | 24 +++++---
.../remap/unit-tests/test_NextHopRoundRobin.cc | 69 ++++++++++++++++++++++
2 files changed, 86 insertions(+), 7 deletions(-)
diff --git a/src/proxy/http/remap/NextHopHealthStatus.cc
b/src/proxy/http/remap/NextHopHealthStatus.cc
index 6c86c26bf3..35e42aa611 100644
--- a/src/proxy/http/remap/NextHopHealthStatus.cc
+++ b/src/proxy/http/remap/NextHopHealthStatus.cc
@@ -120,8 +120,12 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const char
*hostname, const int
new_fail_count = h->failCount = 1;
}
} else if (result.retry == true) {
- h->failedAt = _now;
- new_fail_count = h->failCount += 1;
+ h->failedAt = _now;
+ if (h->failCount < UINT32_MAX) {
+ new_fail_count = ++h->failCount;
+ } else {
+ new_fail_count = UINT32_MAX;
+ }
}
} // end lock guard
@@ -132,19 +136,25 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const
char *hostname, const int
{ // lock guard
std::lock_guard<std::mutex> lock(h->_mutex);
if ((h->failedAt.load() + retry_time) < _now) {
+ h->failedAt = _now;
new_fail_count = h->failCount = 1;
- h->failedAt = _now;
} else {
- new_fail_count = h->failCount += 1;
+ if (h->failCount < UINT32_MAX) {
+ new_fail_count = ++h->failCount;
+ } else {
+ new_fail_count = UINT32_MAX;
+ }
}
} // end of lock_guard
- NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %d for
%s", sm_id, new_fail_count, h->hostname.c_str());
+ NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %"
PRIu32 " for %s", sm_id, new_fail_count,
+ h->hostname.c_str());
}
if (new_fail_count >= fail_threshold) {
h->set_unavailable();
- NH_Note("[%" PRId64 "] Failure threshold met failcount:%d >=
threshold:%" PRId64 ", http parent proxy %s marked down", sm_id,
- new_fail_count, fail_threshold, h->hostname.c_str());
+ NH_Note("[%" PRId64 "] Failure threshold met failcount:%" PRIu32 " >=
threshold:%" PRId64
+ ", http parent proxy %s marked down",
+ sm_id, new_fail_count, fail_threshold, h->hostname.c_str());
NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] NextHop %s marked unavailable,
h->available=%s", sm_id, h->hostname.c_str(),
(h->available.load()) ? "true" : "false");
}
diff --git a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
index 908dbc35e5..52ae3b511f 100644
--- a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
+++ b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
@@ -381,3 +381,72 @@ SCENARIO("Testing NextHopRoundRobin class, using policy
'latched'", "[NextHopRou
}
}
}
+
+SCENARIO("Testing NextHopHealthStatus failCount overflow saturation",
"[NextHopHealthStatus]")
+{
+ // No thread setup, forbid use of thread local allocators.
+ cmd_disable_pfreelist = true;
+ http_init();
+
+ GIVEN("Loading the round-robin-tests.yaml config for overflow test.")
+ {
+ NextHopStrategyFactory nhf(TS_SRC_DIR "/round-robin-tests.yaml");
+ NextHopSelectionStrategy *const strategy =
nhf.strategyInstance("rr-strict-exhaust-ring");
+
+ REQUIRE(nhf.strategies_loaded == true);
+ REQUIRE(strategy != nullptr);
+
+ WHEN("failCount is near UINT32_MAX and markNextHop is called")
+ {
+ HttpSM sm;
+ ParentResult *result = &sm.t_state.parent_result;
+ TSHttpTxn txnp = reinterpret_cast<TSHttpTxn>(&sm);
+
+ // Select a host so result is populated.
+ build_request(20001, &sm, nullptr, "rabbit.net", nullptr);
+ strategy->findNextHop(txnp);
+ REQUIRE(result->result == ParentResultType::SPECIFIED);
+ REQUIRE(result->hostname != nullptr);
+
+ // Get the HostRecord for the selected host.
+ std::shared_ptr<HostRecord> host_rec;
+ for (auto &group : strategy->host_groups) {
+ for (auto &h : group) {
+ if (h->hostname == result->hostname) {
+ host_rec = h;
+ break;
+ }
+ }
+ if (host_rec) {
+ break;
+ }
+ }
+ REQUIRE(host_rec != nullptr);
+
+ // Set failCount to UINT32_MAX - 1 to test saturation.
+ host_rec->failCount = UINT32_MAX - 1;
+ host_rec->failedAt = time(nullptr);
+
+ // Set fail_threshold very high so set_unavailable doesn't interfere.
+ extern char _my_txn_conf[];
+ auto *oride =
reinterpret_cast<OverridableHttpConfigParams *>(_my_txn_conf);
+ oride->parent_fail_threshold = static_cast<int64_t>(UINT32_MAX) + 1;
+ oride->parent_retry_time = 30; // large retry window so we stay in
the increment path
+
+ THEN("failCount saturates at UINT32_MAX and does not wrap to 0")
+ {
+ // First markNextHop: UINT32_MAX-1 -> UINT32_MAX
+ strategy->markNextHop(txnp, result->hostname, result->port,
NHCmd::MARK_DOWN);
+ CHECK(host_rec->failCount.load() == UINT32_MAX);
+
+ // Second markNextHop: should stay at UINT32_MAX (saturated)
+ strategy->markNextHop(txnp, result->hostname, result->port,
NHCmd::MARK_DOWN);
+ CHECK(host_rec->failCount.load() == UINT32_MAX);
+
+ // Verify host is still available (threshold not met since threshold >
UINT32_MAX)
+ CHECK(host_rec->available.load() == true);
+ }
+ br_destroy(sm);
+ }
+ }
+}