This is an automated email from the ASF dual-hosted git repository.

chenBright pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/brpc.git


The following commit(s) were added to refs/heads/master by this push:
     new 6ddefcef Roll back LocalityAwareLoadBalancer to gettimeofday_us to 
match callers (#3283)
6ddefcef is described below

commit 6ddefcef87974b9e9f1072a4f5c7b6fda8db882d
Author: huangjun <[email protected]>
AuthorDate: Sun Apr 26 17:44:07 2026 +0800

    Roll back LocalityAwareLoadBalancer to gettimeofday_us to match callers 
(#3283)
    
    PR #3268 ("Use monotonic time instead of wall time") switched
    LocalityAwareLoadBalancer::Weight::Update's end_time_us and
    LocalityAwareLoadBalancer::Describe's now to butil::cpuwide_time_us(),
    but every caller that supplies CallInfo::begin_time_us still uses
    butil::gettimeofday_us():
    
      - Channel::CallMethod (channel.cpp:451) -> Controller::IssueRPC ->
        Controller::Call::begin_time_us -> SelectIn::begin_time_us ->
        CallInfo::begin_time_us
      - Controller::OnVersionedRPCReturned retry sites
        (controller.cpp:672, 715) call IssueRPC(gettimeofday_us()) on
        backup-request and regular retries
    
    The mismatched time domains make
    
        latency = end_time_us - ci.begin_time_us
                = cpuwide_now - wallclock_begin
                ~= -1.7e15 us
    
    trigger the `if (latency <= 0) { /* time skews, ignore */ return 0; }`
    short-circuit on every call. _time_q never accumulates samples,
    _avg_latency stays at 0, and locality-aware weight feedback is silently
    disabled.
    
    Visible downstream symptom: cold-start `list://` channels with `lb=la`
    and 2 backends occasionally fail RPCs with EHOSTDOWN ("Fail to select
    server from list://...") on retry even when one backend is healthy.
    Bisected reproduction in xsky/brpc fork:
    
      - 51 commit range c41e838..604dad0c (1.16.1 .. 1.17.0-rc2)
      - master code + LA-driven multipath probe at 2 backends, max_retry=1,
        repeat 500x:
          * commit 771de31e (one before #3268): 0/500 fail
          * commit 12fb539a (#3268):           25/500 fail
          * commit 12fb539a + revert only Weight::Update::end_time_us to
            gettimeofday_us:                    0/500 fail
    
    This commit reverts the LA-side of #3268's clock change so the LB lines
    up with its existing callers again. Channel::CallMethod and the retry
    paths in Controller stay on butil::gettimeofday_us(), which preserves
    the wall-clock semantics of Controller::_begin_time_us /
    Controller::latency_us() that public users rely on.
    
    Adds 
test/brpc_load_balancer_unittest.cpp::la_records_latency_with_consistent_time_source
    which drives a series of SelectServer + Feedback cycles against
    LocalityAwareLoadBalancer (no Server / Channel needed) and asserts
    that _avg_latency reflects the elapsed time, rather than being stuck
    at 0 because of a time-source mismatch.
    
    Co-authored-by: huangjun <[email protected]>
---
 src/brpc/policy/locality_aware_load_balancer.cpp |  6 +-
 test/brpc_load_balancer_unittest.cpp             | 77 ++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/brpc/policy/locality_aware_load_balancer.cpp 
b/src/brpc/policy/locality_aware_load_balancer.cpp
index 455f6fc3..beea5169 100644
--- a/src/brpc/policy/locality_aware_load_balancer.cpp
+++ b/src/brpc/policy/locality_aware_load_balancer.cpp
@@ -18,7 +18,7 @@
 
 #include <limits>                                            // numeric_limits
 #include <gflags/gflags.h>
-#include "butil/time.h"                                      // cpuwide_time_us
+#include "butil/time.h"                                       // 
gettimeofday_us
 #include "butil/fast_rand.h"
 #include "brpc/log.h"
 #include "brpc/socket.h"
@@ -376,7 +376,7 @@ void LocalityAwareLoadBalancer::Feedback(const CallInfo& 
info) {
 
 int64_t LocalityAwareLoadBalancer::Weight::Update(
     const CallInfo& ci, size_t index) {
-    const int64_t end_time_us = butil::cpuwide_time_us();
+    const int64_t end_time_us = butil::gettimeofday_us();
     const int64_t latency = end_time_us - ci.begin_time_us;
     BAIDU_SCOPED_LOCK(_mutex);
     if (Disabled()) {
@@ -524,7 +524,7 @@ void LocalityAwareLoadBalancer::Describe(
     if (_db_servers.Read(&s) != 0) {
         os << "fail to read _db_servers";
     } else {
-        const int64_t now = butil::cpuwide_time_us();
+        const int64_t now = butil::gettimeofday_us();
         const size_t n = s->weight_tree.size();
         os << '[';
         for (size_t i = 0; i < n; ++i) {
diff --git a/test/brpc_load_balancer_unittest.cpp 
b/test/brpc_load_balancer_unittest.cpp
index 2a2be242..76ad005e 100644
--- a/test/brpc_load_balancer_unittest.cpp
+++ b/test/brpc_load_balancer_unittest.cpp
@@ -1303,6 +1303,83 @@ TEST_F(LoadBalancerTest, 
revived_from_all_failed_intergrated) {
 }
 #endif // BUTIL_USE_ASAN
 
+// Regression for #3268's incomplete migration of LocalityAwareLoadBalancer.
+//
+// #3268 switched `LocalityAwareLoadBalancer::Weight::Update::end_time_us` and
+// `LocalityAwareLoadBalancer::Describe::now` to `butil::cpuwide_time_us()`
+// while every caller that supplies `CallInfo::begin_time_us` (the RPC entry
+// in `Channel::CallMethod` and the retry sites in
+// `Controller::OnVersionedRPCReturned`) still uses `butil::gettimeofday_us()`.
+// The resulting time-source mismatch makes
+//
+//     latency = end_time_us - ci.begin_time_us
+//             = cpuwide_now - wallclock_begin
+//             ~= -1.7e15 us  (huge negative)
+//
+// trigger the
+//
+//     if (latency <= 0) { /* time skews, ignore the sample */ return 0; }
+//
+// short-circuit on every call. `_time_q` never accumulates samples,
+// `_avg_latency` stays at 0, and locality-aware weight feedback is silently
+// disabled. Visible downstream symptom: cold-start `list://` channels with
+// `lb=la` and 2 backends occasionally fail RPCs with `EHOSTDOWN`
+// ("Fail to select server") on retry even when one backend is healthy.
+//
+// This commit reverts the LA side of #3268, so `Weight::Update` and
+// `Describe` once again use `butil::gettimeofday_us()` to match every
+// existing caller of `CallInfo::begin_time_us`.
+//
+// The test below runs entirely against `LocalityAwareLoadBalancer` (no
+// Server / Channel is involved), so it is hermetic. It supplies a
+// gettimeofday-based `begin_time_us` (matching what `Channel::CallMethod`
+// passes today) and asserts that the LB records a positive `_avg_latency`,
+// rather than tripping the time-skew short-circuit.
+TEST_F(LoadBalancerTest, la_records_latency_with_consistent_time_source) {
+    LALB lalb;
+    char addr[] = "192.168.1.1:8080";
+    butil::EndPoint dummy;
+    ASSERT_EQ(0, str2endpoint(addr, &dummy));
+    brpc::ServerId id(8888);
+    brpc::SocketOptions options;
+    options.remote_side = dummy;
+    ASSERT_EQ(0, brpc::Socket::Create(options, &id.id));
+    ASSERT_TRUE(lalb.AddServer(id));
+
+    auto avg_latency = [&]() -> int64_t {
+        std::ostringstream os;
+        brpc::DescribeOptions opts;
+        opts.verbose = true;
+        lalb.Describe(os, opts);
+        const std::string s = os.str();
+        const size_t p = s.find("avg_latency=");
+        if (p == std::string::npos) return -1;
+        return strtoll(s.c_str() + p + strlen("avg_latency="), NULL, 10);
+    };
+
+    // Drive a few "RPCs": pick a server, sleep ~2ms, feed back. begin_time_us
+    // comes from gettimeofday_us(), matching what Channel::CallMethod and the
+    // retry sites in Controller::OnVersionedRPCReturned pass on every RPC.
+    for (int i = 0; i < 8; ++i) {
+        const int64_t begin_us = butil::gettimeofday_us();
+        brpc::SocketUniquePtr ptr;
+        brpc::LoadBalancer::SelectIn in = { begin_us, true, false, 0u, NULL };
+        brpc::LoadBalancer::SelectOut out(&ptr);
+        ASSERT_EQ(0, lalb.SelectServer(in, &out));
+        bthread_usleep(2000);
+        brpc::LoadBalancer::CallInfo ci = { begin_us, id.id, 0, NULL };
+        lalb.Feedback(ci);
+    }
+
+    // _avg_latency must reflect actual elapsed time. If this is 0, either
+    // Weight::Update::end_time_us was changed away from gettimeofday_us
+    // again (re-introducing the time-source mismatch) or some caller of
+    // CallInfo::begin_time_us drifted to a different clock domain.
+    EXPECT_GT(avg_latency(), 0);
+
+    ASSERT_EQ(0, brpc::Socket::SetFailed(id.id));
+}
+
 TEST_F(LoadBalancerTest, la_selection_too_long) {
     brpc::GlobalInitializeOrDie();
     brpc::LoadBalancerWithNaming lb;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to