This is an automated email from the ASF dual-hosted git repository.
cmcfarlen pushed a commit to branch 10.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/10.0.x by this push:
new 388565d009 Fixes to HostDB marking IPs up/down (#11814)
388565d009 is described below
commit 388565d009fbea88108a57715f4bfaedbcf0ae1a
Author: Chris McFarlen <[email protected]>
AuthorDate: Thu Oct 10 19:19:18 2024 -0500
Fixes to HostDB marking IPs up/down (#11814)
* Fixes to hostdb marking IPs up/down
* Mark HostDBInfo::select const
* Update dns_host_down test to reflect fixed behavior for returning 500
---------
Co-authored-by: Chris McFarlen <[email protected]>
(cherry picked from commit ffa0b26e037aa39a7b4c759955b7f9bf017ae928)
---
include/iocore/hostdb/HostDBProcessor.h | 31 +++++++--
src/iocore/hostdb/HostDB.cc | 73 +++++++++++-----------
src/proxy/http/HttpSM.cc | 49 +++++++--------
src/proxy/http/HttpTransact.cc | 4 +-
tests/gold_tests/dns/dns_host_down.test.py | 3 +-
.../gold_tests/dns/replay/server_down.replay.yaml | 4 +-
6 files changed, 91 insertions(+), 73 deletions(-)
diff --git a/include/iocore/hostdb/HostDBProcessor.h
b/include/iocore/hostdb/HostDBProcessor.h
index 30d572a497..a532e23e49 100644
--- a/include/iocore/hostdb/HostDBProcessor.h
+++ b/include/iocore/hostdb/HostDBProcessor.h
@@ -25,6 +25,7 @@
#include <chrono>
#include <atomic>
+#include <utility>
#include "tscore/HashFNV.h"
#include "tscore/ink_time.h"
@@ -150,7 +151,7 @@ struct HostDBInfo {
* If a zombie is selected the failure time is updated to make it appear
down to other threads in a thread safe
* manner. The caller should check @c last_fail_time to see if a zombie was
selected.
*/
- bool select(ts_time now, ts_seconds fail_window);
+ bool select(ts_time now, ts_seconds fail_window) const;
/// Check if this info is valid.
bool is_valid() const;
@@ -167,6 +168,8 @@ struct HostDBInfo {
*/
bool mark_down(ts_time now);
+ std::pair<bool, uint8_t> increment_fail_count(ts_time now, uint8_t
max_retries);
+
/** Mark the target as up / alive.
*
* @return Previous alive state of the target.
@@ -243,8 +246,12 @@ HostDBInfo::is_down(ts_time now, ts_seconds fail_window)
inline bool
HostDBInfo::mark_up()
{
- auto t = last_failure.exchange(TS_TIME_ZERO);
- return t != TS_TIME_ZERO;
+ auto t = last_failure.exchange(TS_TIME_ZERO);
+ bool was_down = t != TS_TIME_ZERO;
+ if (was_down) {
+ fail_count.store(0);
+ }
+ return was_down;
}
inline bool
@@ -254,21 +261,33 @@ HostDBInfo::mark_down(ts_time now)
return last_failure.compare_exchange_strong(t0, now);
}
+inline std::pair<bool, uint8_t>
+HostDBInfo::increment_fail_count(ts_time now, uint8_t max_retries)
+{
+ auto fcount = ++fail_count;
+ bool marked_down = false;
+ if (fcount >= max_retries) {
+ marked_down = mark_down(now);
+ }
+ return std::make_pair(marked_down, fcount);
+}
+
inline bool
-HostDBInfo::select(ts_time now, ts_seconds fail_window)
+HostDBInfo::select(ts_time now, ts_seconds fail_window) const
{
auto t0 = this->last_fail_time();
if (t0 == TS_TIME_ZERO) {
return true; // it's alive and so is valid for selection.
}
- // Success means this is a zombie and this thread updated the failure time.
- return (t0 + fail_window < now) && last_failure.compare_exchange_strong(t0,
now);
+ // Return true and give it a try if enough time is elapsed since the last
failure
+ return (t0 + fail_window < now);
}
inline void
HostDBInfo::migrate_from(HostDBInfo::self_type const &that)
{
this->last_failure = that.last_failure.load();
+ this->fail_count = that.fail_count.load();
this->http_version = that.http_version;
}
diff --git a/src/iocore/hostdb/HostDB.cc b/src/iocore/hostdb/HostDB.cc
index 361f79e83d..d1b597d626 100644
--- a/src/iocore/hostdb/HostDB.cc
+++ b/src/iocore/hostdb/HostDB.cc
@@ -1418,46 +1418,49 @@ HostDBRecord::select_best_http(ts_time now, ts_seconds
fail_window, sockaddr con
HostDBInfo *best_alive = nullptr;
auto info{this->rr_info()};
-
- if (HostDBProcessor::hostdb_strict_round_robin) {
- // Always select the next viable target - select failure means no valid
targets at all.
- best_alive = best_any = this->select_next_rr(now, fail_window);
- Dbg(dbg_ctl_hostdb, "Using strict round robin - index %d",
this->index_of(best_alive));
- } else if (HostDBProcessor::hostdb_timed_round_robin > 0) {
- auto ctime = rr_ctime.load(); // cache for atomic update.
- auto ntime = ctime + ts_seconds(HostDBProcessor::hostdb_timed_round_robin);
- // Check and update RR if it's time - this always yields a valid target if
there is one.
- if (now > ntime && rr_ctime.compare_exchange_strong(ctime, ntime)) {
+ if (info.count() > 1) {
+ if (HostDBProcessor::hostdb_strict_round_robin) {
+ // Always select the next viable target - select failure means no valid
targets at all.
best_alive = best_any = this->select_next_rr(now, fail_window);
- Dbg(dbg_ctl_hostdb, "Round robin timed interval expired - index %d",
this->index_of(best_alive));
- } else { // pick the current index, which may be down.
- best_any = &info[this->rr_idx()];
- }
- Dbg(dbg_ctl_hostdb, "Using timed round robin - index %d",
this->index_of(best_any));
- } else {
- // Walk the entries and find the best (largest) hash.
- unsigned int best_hash = 0; // any hash is better than this.
- for (auto &target : info) {
- unsigned int h = HOSTDB_CLIENT_IP_HASH(hash_addr, target.data.ip);
- if (best_hash <= h) {
- best_any = ⌖
- best_hash = h;
+ Dbg(dbg_ctl_hostdb, "Using strict round robin - index %d",
this->index_of(best_alive));
+ } else if (HostDBProcessor::hostdb_timed_round_robin > 0) {
+ auto ctime = rr_ctime.load(); // cache for atomic update.
+ auto ntime = ctime +
ts_seconds(HostDBProcessor::hostdb_timed_round_robin);
+ // Check and update RR if it's time - this always yields a valid target
if there is one.
+ if (now > ntime && rr_ctime.compare_exchange_strong(ctime, ntime)) {
+ best_alive = best_any = this->select_next_rr(now, fail_window);
+ Dbg(dbg_ctl_hostdb, "Round robin timed interval expired - index %d",
this->index_of(best_alive));
+ } else { // pick the current index, which may be down.
+ best_any = &info[this->rr_idx()];
}
+ Dbg(dbg_ctl_hostdb, "Using timed round robin - index %d",
this->index_of(best_any));
+ } else {
+ // Walk the entries and find the best (largest) hash.
+ unsigned int best_hash = 0; // any hash is better than this.
+ for (auto &target : info) {
+ unsigned int h = HOSTDB_CLIENT_IP_HASH(hash_addr, target.data.ip);
+ if (best_hash <= h) {
+ best_any = ⌖
+ best_hash = h;
+ }
+ }
+ Dbg(dbg_ctl_hostdb, "Using client affinity - index %d",
this->index_of(best_any));
}
- Dbg(dbg_ctl_hostdb, "Using client affinity - index %d",
this->index_of(best_any));
- }
-
- // If there is a base choice, search for valid target starting there.
- // Otherwise there is no valid target in the record.
- if (best_any && !best_alive) {
- // Starting at the current target, search for a valid one.
- for (unsigned short i = 0; i < rr_count; i++) {
- auto target = &info[this->rr_idx(i)];
- if (target->select(now, fail_window)) {
- best_alive = target;
- break;
+
+ // If there is a base choice, search for valid target starting there.
+ // Otherwise there is no valid target in the record.
+ if (best_any && !best_alive) {
+ // Starting at the current target, search for a valid one.
+ for (unsigned short i = 0; i < rr_count; i++) {
+ auto target = &info[this->rr_idx(i)];
+ if (target->select(now, fail_window)) {
+ best_alive = target;
+ break;
+ }
}
}
+ } else {
+ best_alive = &info[0];
}
return best_alive;
diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc
index 225296bf48..89ce9647b6 100644
--- a/src/proxy/http/HttpSM.cc
+++ b/src/proxy/http/HttpSM.cc
@@ -4555,12 +4555,14 @@ HttpSM::do_hostdb_update_if_necessary()
this->mark_host_failure(&t_state.dns_info,
ts_clock::from_time_t(t_state.client_request_time));
} else {
if (t_state.dns_info.mark_active_server_alive()) {
+ char addrbuf[INET6_ADDRPORTSTRLEN];
+ ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf));
+ ATS_PROBE2(mark_active_server_alive, sm_id, addrbuf);
if (t_state.dns_info.record->is_srv()) {
- SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking SRV: %s as
up", sm_id, t_state.dns_info.record->name());
+ SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking SRV: %s(%s)
as up", sm_id, t_state.dns_info.record->name(),
+ addrbuf);
} else {
- char addrbuf[INET6_ADDRPORTSTRLEN];
- SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking IP: %s as
up", sm_id,
- ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf)));
+ SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking IP: %s as
up", sm_id, addrbuf);
}
}
}
@@ -5768,30 +5770,26 @@ HttpSM::mark_host_failure(ResolveInfo *info, ts_time
time_down)
if (info->active) {
if (time_down != TS_TIME_ZERO) {
+ ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf));
// Increment the fail_count
- if (++info->active->fail_count >=
t_state.txn_conf->connect_attempts_rr_retries) {
- if (info->active) {
- if (info->active->last_failure.load() == TS_TIME_ZERO) {
- char *url_str =
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
- int host_len;
- const char *host_name_ptr =
t_state.unmapped_url.host_get(&host_len);
- std::string_view host_name{host_name_ptr, size_t(host_len)};
- swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {}
for host='{}' url='{}' marking down",
-
swoc::bwf::Errno(t_state.current.server->connect_result),
t_state.current.server->dst_addr, host_name,
- swoc::bwf::FirstOf(url_str, "<none>"));
- Log::error("%s", error_bw_buffer.c_str());
- }
- info->active->last_failure = time_down;
- SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down",
- ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf)));
- } else {
- SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d",
- ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf)), info->active->fail_count.load());
- }
+ if (auto [down, fail_count] =
info->active->increment_fail_count(time_down,
t_state.txn_conf->connect_attempts_rr_retries);
+ down) {
+ char *url_str =
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
+ int host_len;
+ const char *host_name_ptr =
t_state.unmapped_url.host_get(&host_len);
+ std::string_view host_name{host_name_ptr,
static_cast<size_t>(host_len)};
+ swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for
host='{}' url='{}' fail_count='{}' marking down",
+
swoc::bwf::Errno(t_state.current.server->connect_result),
t_state.current.server->dst_addr, host_name,
+ swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
+ Log::error("%s", error_bw_buffer.c_str());
+ SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
+ ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
+ } else {
+ ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
+ SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf,
fail_count);
}
} else { // Clear the failure
- info->active->fail_count = 0;
- info->active->last_failure = time_down;
+ info->active->mark_up();
}
}
#ifdef DEBUG
@@ -8118,6 +8116,7 @@ HttpSM::set_next_state()
case HttpTransact::SM_ACTION_ORIGIN_SERVER_RR_MARK_DOWN: {
HTTP_SM_SET_DEFAULT_HANDLER(&HttpSM::state_mark_os_down);
+ ATS_PROBE(next_state_SM_ACTION_ORIGIN_SERVER_RR_MARK_DOWN);
ink_assert(t_state.dns_info.looking_up == ResolveInfo::ORIGIN_SERVER);
diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc
index 8c1fb59570..cb49e0d481 100644
--- a/src/proxy/http/HttpTransact.cc
+++ b/src/proxy/http/HttpTransact.cc
@@ -47,6 +47,7 @@
#include "proxy/http/HttpBodyFactory.h"
#include "proxy/IPAllow.h"
#include "iocore/utils/Machine.h"
+#include "ts/ats_probe.h"
DbgCtl HttpTransact::State::_dbg_ctl{"http"};
@@ -521,8 +522,7 @@ HttpTransact::is_server_negative_cached(State *s)
// down to 2*down_server_timeout
if (s->dns_info.active &&
ts_clock::from_time_t(s->client_request_time) +
s->txn_conf->down_server_timeout < s->dns_info.active->last_fail_time()) {
- s->dns_info.active->last_failure = TS_TIME_ZERO;
- s->dns_info.active->fail_count = 0;
+ s->dns_info.active->mark_up();
ink_assert(!"extreme clock skew");
return true;
}
diff --git a/tests/gold_tests/dns/dns_host_down.test.py
b/tests/gold_tests/dns/dns_host_down.test.py
index 1bb5dc2b8d..74930c7544 100644
--- a/tests/gold_tests/dns/dns_host_down.test.py
+++ b/tests/gold_tests/dns/dns_host_down.test.py
@@ -81,9 +81,8 @@ class DownCachedOriginServerTest:
os.path.join(Test.Variables.AtsTestToolsDir, 'condwait') + ' 60 1
-f ' +
os.path.join(self._ts.Variables.LOGDIR, 'error.log'))
- self._ts.Disk.error_log.Content =
Testers.ContainsExpression("/dns/mark/down' marking down", "host should be
marked down")
self._ts.Disk.error_log.Content = Testers.ContainsExpression(
- "DNS Error: no valid server http://resolve.this.com", "DNS lookup
should fail")
+ "/dns/mark/down' fail_count='1' marking down", "host should be
marked down")
def run(self):
self._test_host_mark_down()
diff --git a/tests/gold_tests/dns/replay/server_down.replay.yaml
b/tests/gold_tests/dns/replay/server_down.replay.yaml
index d1dd1e2022..d5b33bc784 100644
--- a/tests/gold_tests/dns/replay/server_down.replay.yaml
+++ b/tests/gold_tests/dns/replay/server_down.replay.yaml
@@ -55,7 +55,5 @@ sessions:
server-response:
status: 200
- # Previous request marked host down so DNS lookup returns unsuccessful
- # 500 response indicates no valid server
proxy-response:
- status: 500
+ status: 502