This is an automated email from the ASF dual-hosted git repository.
chenBright pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/brpc.git
The following commit(s) were added to refs/heads/master by this push:
new 6ddefcef Roll back LocalityAwareLoadBalancer to gettimeofday_us to
match callers (#3283)
6ddefcef is described below
commit 6ddefcef87974b9e9f1072a4f5c7b6fda8db882d
Author: huangjun <[email protected]>
AuthorDate: Sun Apr 26 17:44:07 2026 +0800
Roll back LocalityAwareLoadBalancer to gettimeofday_us to match callers
(#3283)
PR #3268 ("Use monotonic time instead of wall time") switched
LocalityAwareLoadBalancer::Weight::Update's end_time_us and
LocalityAwareLoadBalancer::Describe's now to butil::cpuwide_time_us(),
but every caller that supplies CallInfo::begin_time_us still uses
butil::gettimeofday_us():
- Channel::CallMethod (channel.cpp:451) -> Controller::IssueRPC ->
Controller::Call::begin_time_us -> SelectIn::begin_time_us ->
CallInfo::begin_time_us
- Controller::OnVersionedRPCReturned retry sites
(controller.cpp:672, 715) call IssueRPC(gettimeofday_us()) on
backup-request and regular retries
The mismatched time domains make
latency = end_time_us - ci.begin_time_us
= cpuwide_now - wallclock_begin
~= -1.7e15 us
trigger the `if (latency <= 0) { /* time skews, ignore */ return 0; }`
short-circuit on every call. _time_q never accumulates samples,
_avg_latency stays at 0, and locality-aware weight feedback is silently
disabled.
Visible downstream symptom: cold-start `list://` channels with `lb=la`
and 2 backends occasionally fail RPCs with EHOSTDOWN ("Fail to select
server from list://...") on retry even when one backend is healthy.
Bisected reproduction in xsky/brpc fork:
- 51 commit range c41e838..604dad0c (1.16.1 .. 1.17.0-rc2)
- master code + LA-driven multipath probe at 2 backends, max_retry=1,
repeat 500x:
* commit 771de31e (one before #3268): 0/500 fail
* commit 12fb539a (#3268): 25/500 fail
* commit 12fb539a + revert only Weight::Update::end_time_us to
gettimeofday_us: 0/500 fail
This commit reverts the LA-side of #3268's clock change so the LB lines
up with its existing callers again. Channel::CallMethod and the retry
paths in Controller stay on butil::gettimeofday_us(), which preserves
the wall-clock semantics of Controller::_begin_time_us /
Controller::latency_us() that public users rely on.
Adds
test/brpc_load_balancer_unittest.cpp::la_records_latency_with_consistent_time_source
which drives a series of SelectServer + Feedback cycles against
LocalityAwareLoadBalancer (no Server / Channel needed) and asserts
that _avg_latency reflects the elapsed time, rather than being stuck
at 0 because of a time-source mismatch.
Co-authored-by: huangjun <[email protected]>
---
src/brpc/policy/locality_aware_load_balancer.cpp | 6 +-
test/brpc_load_balancer_unittest.cpp | 77 ++++++++++++++++++++++++
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/src/brpc/policy/locality_aware_load_balancer.cpp
b/src/brpc/policy/locality_aware_load_balancer.cpp
index 455f6fc3..beea5169 100644
--- a/src/brpc/policy/locality_aware_load_balancer.cpp
+++ b/src/brpc/policy/locality_aware_load_balancer.cpp
@@ -18,7 +18,7 @@
#include <limits> // numeric_limits
#include <gflags/gflags.h>
-#include "butil/time.h" // cpuwide_time_us
+#include "butil/time.h" //
gettimeofday_us
#include "butil/fast_rand.h"
#include "brpc/log.h"
#include "brpc/socket.h"
@@ -376,7 +376,7 @@ void LocalityAwareLoadBalancer::Feedback(const CallInfo&
info) {
int64_t LocalityAwareLoadBalancer::Weight::Update(
const CallInfo& ci, size_t index) {
- const int64_t end_time_us = butil::cpuwide_time_us();
+ const int64_t end_time_us = butil::gettimeofday_us();
const int64_t latency = end_time_us - ci.begin_time_us;
BAIDU_SCOPED_LOCK(_mutex);
if (Disabled()) {
@@ -524,7 +524,7 @@ void LocalityAwareLoadBalancer::Describe(
if (_db_servers.Read(&s) != 0) {
os << "fail to read _db_servers";
} else {
- const int64_t now = butil::cpuwide_time_us();
+ const int64_t now = butil::gettimeofday_us();
const size_t n = s->weight_tree.size();
os << '[';
for (size_t i = 0; i < n; ++i) {
diff --git a/test/brpc_load_balancer_unittest.cpp
b/test/brpc_load_balancer_unittest.cpp
index 2a2be242..76ad005e 100644
--- a/test/brpc_load_balancer_unittest.cpp
+++ b/test/brpc_load_balancer_unittest.cpp
@@ -1303,6 +1303,83 @@ TEST_F(LoadBalancerTest,
revived_from_all_failed_intergrated) {
}
#endif // BUTIL_USE_ASAN
+// Regression for #3268's incomplete migration of LocalityAwareLoadBalancer.
+//
+// #3268 switched `LocalityAwareLoadBalancer::Weight::Update::end_time_us` and
+// `LocalityAwareLoadBalancer::Describe::now` to `butil::cpuwide_time_us()`
+// while every caller that supplies `CallInfo::begin_time_us` (the RPC entry
+// in `Channel::CallMethod` and the retry sites in
+// `Controller::OnVersionedRPCReturned`) still uses `butil::gettimeofday_us()`.
+// The resulting time-source mismatch makes
+//
+// latency = end_time_us - ci.begin_time_us
+// = cpuwide_now - wallclock_begin
+// ~= -1.7e15 us (huge negative)
+//
+// trigger the
+//
+// if (latency <= 0) { /* time skews, ignore the sample */ return 0; }
+//
+// short-circuit on every call. `_time_q` never accumulates samples,
+// `_avg_latency` stays at 0, and locality-aware weight feedback is silently
+// disabled. Visible downstream symptom: cold-start `list://` channels with
+// `lb=la` and 2 backends occasionally fail RPCs with `EHOSTDOWN`
+// ("Fail to select server") on retry even when one backend is healthy.
+//
+// This commit reverts the LA side of #3268, so `Weight::Update` and
+// `Describe` once again use `butil::gettimeofday_us()` to match every
+// existing caller of `CallInfo::begin_time_us`.
+//
+// The test below runs entirely against `LocalityAwareLoadBalancer` (no
+// Server / Channel is involved), so it is hermetic. It supplies a
+// gettimeofday-based `begin_time_us` (matching what `Channel::CallMethod`
+// passes today) and asserts that the LB records a positive `_avg_latency`,
+// rather than tripping the time-skew short-circuit.
+TEST_F(LoadBalancerTest, la_records_latency_with_consistent_time_source) {
+ LALB lalb;
+ char addr[] = "192.168.1.1:8080";
+ butil::EndPoint dummy;
+ ASSERT_EQ(0, str2endpoint(addr, &dummy));
+ brpc::ServerId id(8888);
+ brpc::SocketOptions options;
+ options.remote_side = dummy;
+ ASSERT_EQ(0, brpc::Socket::Create(options, &id.id));
+ ASSERT_TRUE(lalb.AddServer(id));
+
+ auto avg_latency = [&]() -> int64_t {
+ std::ostringstream os;
+ brpc::DescribeOptions opts;
+ opts.verbose = true;
+ lalb.Describe(os, opts);
+ const std::string s = os.str();
+ const size_t p = s.find("avg_latency=");
+ if (p == std::string::npos) return -1;
+ return strtoll(s.c_str() + p + strlen("avg_latency="), NULL, 10);
+ };
+
+ // Drive a few "RPCs": pick a server, sleep ~2ms, feed back. begin_time_us
+ // comes from gettimeofday_us(), matching what Channel::CallMethod and the
+ // retry sites in Controller::OnVersionedRPCReturned pass on every RPC.
+ for (int i = 0; i < 8; ++i) {
+ const int64_t begin_us = butil::gettimeofday_us();
+ brpc::SocketUniquePtr ptr;
+ brpc::LoadBalancer::SelectIn in = { begin_us, true, false, 0u, NULL };
+ brpc::LoadBalancer::SelectOut out(&ptr);
+ ASSERT_EQ(0, lalb.SelectServer(in, &out));
+ bthread_usleep(2000);
+ brpc::LoadBalancer::CallInfo ci = { begin_us, id.id, 0, NULL };
+ lalb.Feedback(ci);
+ }
+
+ // _avg_latency must reflect actual elapsed time. If this is 0, either
+ // Weight::Update::end_time_us was changed away from gettimeofday_us
+ // again (re-introducing the time-source mismatch) or some caller of
+ // CallInfo::begin_time_us drifted to a different clock domain.
+ EXPECT_GT(avg_latency(), 0);
+
+ ASSERT_EQ(0, brpc::Socket::SetFailed(id.id));
+}
+
TEST_F(LoadBalancerTest, la_selection_too_long) {
brpc::GlobalInitializeOrDie();
brpc::LoadBalancerWithNaming lb;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]