This is an automated email from the ASF dual-hosted git repository.

traeak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c052b124a nexthop health status don't overflow fail count (#13164)
0c052b124a is described below

commit 0c052b124a3b4fb26967a07815cca3fef7bd0495
Author: Brian Olsen <[email protected]>
AuthorDate: Thu May 14 14:09:46 2026 -0600

    nexthop health status don't overflow fail count (#13164)
---
 src/proxy/http/remap/NextHopHealthStatus.cc        | 24 +++++---
 .../remap/unit-tests/test_NextHopRoundRobin.cc     | 69 ++++++++++++++++++++++
 2 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/src/proxy/http/remap/NextHopHealthStatus.cc 
b/src/proxy/http/remap/NextHopHealthStatus.cc
index 6c86c26bf3..35e42aa611 100644
--- a/src/proxy/http/remap/NextHopHealthStatus.cc
+++ b/src/proxy/http/remap/NextHopHealthStatus.cc
@@ -120,8 +120,12 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const char 
*hostname, const int
             new_fail_count = h->failCount = 1;
           }
         } else if (result.retry == true) {
-          h->failedAt    = _now;
-          new_fail_count = h->failCount += 1;
+          h->failedAt = _now;
+          if (h->failCount < UINT32_MAX) {
+            new_fail_count = ++h->failCount;
+          } else {
+            new_fail_count = UINT32_MAX;
+          }
         }
       } // end lock guard
 
@@ -132,19 +136,25 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const 
char *hostname, const int
       { // lock guard
         std::lock_guard<std::mutex> lock(h->_mutex);
         if ((h->failedAt.load() + retry_time) < _now) {
+          h->failedAt    = _now;
           new_fail_count = h->failCount = 1;
-          h->failedAt                   = _now;
         } else {
-          new_fail_count = h->failCount += 1;
+          if (h->failCount < UINT32_MAX) {
+            new_fail_count = ++h->failCount;
+          } else {
+            new_fail_count = UINT32_MAX;
+          }
         }
       } // end of lock_guard
-      NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %d for 
%s", sm_id, new_fail_count, h->hostname.c_str());
+      NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %" 
PRIu32 " for %s", sm_id, new_fail_count,
+             h->hostname.c_str());
     }
 
     if (new_fail_count >= fail_threshold) {
       h->set_unavailable();
-      NH_Note("[%" PRId64 "] Failure threshold met failcount:%d >= 
threshold:%" PRId64 ", http parent proxy %s marked down", sm_id,
-              new_fail_count, fail_threshold, h->hostname.c_str());
+      NH_Note("[%" PRId64 "] Failure threshold met failcount:%" PRIu32 " >= 
threshold:%" PRId64
+              ", http parent proxy %s marked down",
+              sm_id, new_fail_count, fail_threshold, h->hostname.c_str());
       NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] NextHop %s marked unavailable, 
h->available=%s", sm_id, h->hostname.c_str(),
              (h->available.load()) ? "true" : "false");
     }
diff --git a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc 
b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
index 908dbc35e5..52ae3b511f 100644
--- a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
+++ b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc
@@ -381,3 +381,72 @@ SCENARIO("Testing NextHopRoundRobin class, using policy 
'latched'", "[NextHopRou
     }
   }
 }
+
+SCENARIO("Testing NextHopHealthStatus failCount overflow saturation", 
"[NextHopHealthStatus]")
+{
+  // No thread setup, forbid use of thread local allocators.
+  cmd_disable_pfreelist = true;
+  http_init();
+
+  GIVEN("Loading the round-robin-tests.yaml config for overflow test.")
+  {
+    NextHopStrategyFactory          nhf(TS_SRC_DIR "/round-robin-tests.yaml");
+    NextHopSelectionStrategy *const strategy = 
nhf.strategyInstance("rr-strict-exhaust-ring");
+
+    REQUIRE(nhf.strategies_loaded == true);
+    REQUIRE(strategy != nullptr);
+
+    WHEN("failCount is near UINT32_MAX and markNextHop is called")
+    {
+      HttpSM        sm;
+      ParentResult *result = &sm.t_state.parent_result;
+      TSHttpTxn     txnp   = reinterpret_cast<TSHttpTxn>(&sm);
+
+      // Select a host so result is populated.
+      build_request(20001, &sm, nullptr, "rabbit.net", nullptr);
+      strategy->findNextHop(txnp);
+      REQUIRE(result->result == ParentResultType::SPECIFIED);
+      REQUIRE(result->hostname != nullptr);
+
+      // Get the HostRecord for the selected host.
+      std::shared_ptr<HostRecord> host_rec;
+      for (auto &group : strategy->host_groups) {
+        for (auto &h : group) {
+          if (h->hostname == result->hostname) {
+            host_rec = h;
+            break;
+          }
+        }
+        if (host_rec) {
+          break;
+        }
+      }
+      REQUIRE(host_rec != nullptr);
+
+      // Set failCount to UINT32_MAX - 1 to test saturation.
+      host_rec->failCount = UINT32_MAX - 1;
+      host_rec->failedAt  = time(nullptr);
+
+      // Set fail_threshold very high so set_unavailable doesn't interfere.
+      extern char _my_txn_conf[];
+      auto       *oride            = 
reinterpret_cast<OverridableHttpConfigParams *>(_my_txn_conf);
+      oride->parent_fail_threshold = static_cast<int64_t>(UINT32_MAX) + 1;
+      oride->parent_retry_time     = 30; // large retry window so we stay in 
the increment path
+
+      THEN("failCount saturates at UINT32_MAX and does not wrap to 0")
+      {
+        // First markNextHop: UINT32_MAX-1 -> UINT32_MAX
+        strategy->markNextHop(txnp, result->hostname, result->port, 
NHCmd::MARK_DOWN);
+        CHECK(host_rec->failCount.load() == UINT32_MAX);
+
+        // Second markNextHop: should stay at UINT32_MAX (saturated)
+        strategy->markNextHop(txnp, result->hostname, result->port, 
NHCmd::MARK_DOWN);
+        CHECK(host_rec->failCount.load() == UINT32_MAX);
+
+        // Verify host is still available (threshold not met since threshold > 
UINT32_MAX)
+        CHECK(host_rec->available.load() == true);
+      }
+      br_destroy(sm);
+    }
+  }
+}

Reply via email to