This is an automated email from the ASF dual-hosted git repository.
lhotari pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pulsar.git
The following commit(s) were added to refs/heads/master by this push:
new 4229e197479 [fix][client] Reset higher-index states on recovery in
SameAuthParamsLookupAutoClusterFailover (#25826)
4229e197479 is described below
commit 4229e197479e8ea10164c30abf95c41ceff2b344
Author: Matteo Merli <[email protected]>
AuthorDate: Wed May 20 06:48:58 2026 -0700
[fix][client] Reset higher-index states on recovery in
SameAuthParamsLookupAutoClusterFailover (#25826)
---
.../SameAuthParamsLookupAutoClusterFailover.java | 11 ++++
...ameAuthParamsLookupAutoClusterFailoverTest.java | 63 ++++++++++++++++++++++
2 files changed, 74 insertions(+)
diff --git
a/pulsar-client/src/main/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailover.java
b/pulsar-client/src/main/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailover.java
index 414d362b4da..8abc0984c91 100644
---
a/pulsar-client/src/main/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailover.java
+++
b/pulsar-client/src/main/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailover.java
@@ -267,6 +267,17 @@ public class SameAuthParamsLookupAutoClusterFailover
implements ServiceUrlProvid
try {
pulsarClient.updateServiceUrl(targetUrl);
pulsarClient.reloadLookUp();
+ // When recovering to a higher-priority service, the check loop
will only probe
+ // indices 0..targetIndex going forward. Any transient state
(e.g., PreFail from
+ // a single timed-out probe) at higher indices would become stuck
because those
+ // indices are no longer probed. Reset them so they start fresh if
a future
+ // failover needs to consider them again.
+ if (targetIndex < currentPulsarServiceIndex) {
+ for (int i = targetIndex + 1; i <
pulsarServiceStateArray.length; i++) {
+ pulsarServiceStateArray[i] = PulsarServiceState.Healthy;
+ checkCounterArray[i].setValue(0);
+ }
+ }
currentPulsarServiceIndex = targetIndex;
} catch (Exception e) {
log.error().attr("logMsg", logMsg).exception(e).log("Failed to");
diff --git
a/pulsar-client/src/test/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailoverTest.java
b/pulsar-client/src/test/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailoverTest.java
index a2470752fd9..d872dfc5672 100644
---
a/pulsar-client/src/test/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailoverTest.java
+++
b/pulsar-client/src/test/java/org/apache/pulsar/client/impl/SameAuthParamsLookupAutoClusterFailoverTest.java
@@ -167,6 +167,69 @@ public class SameAuthParamsLookupAutoClusterFailoverTest {
"Should stay at index 2, not bounce to index 1"));
}
+ /**
+ * Reproduces the bug fixed by resetting the state of indices above the
recovery target (i.e., lower-priority services) on
+ * recovery. The check loop only probes indices
0..currentPulsarServiceIndex, so if a
+ * higher index was left in a transient state (e.g., PreFail from a single
timed-out
+ * probe) at the moment of recovery, it would stay there forever because
no future probe
+ * ever visits it.
+ *
+ * <p>Scenario: failover 0 -> 2 (state[2]=Healthy). url1 recovers and is
about to trigger
+ * recovery 2 -> 1. On the same check cycle that promotes state[1] to
Healthy, url2 sees
+ * one transient probe failure that flips state[2] from Healthy to
PreFail. The recovery
+ * fires (current 2 -> 1) and from that point on the loop only probes
indices 0 and 1.
+ *
+ * <p>Without the fix, state[2] stays at PreFail forever. With the fix,
state[2] is
+ * reset to Healthy when the recovery transition runs.
+ */
+ @Test(timeOut = 30000)
+ public void testRecoveryResetsHigherIndexStaleState() throws Exception {
+ // url0 down, url1 down, url2 up.
+ setLookupResult(URL0, false);
+ setLookupResult(URL1, false);
+ setLookupResult(URL2, true);
+
+ // Pre-set state[0]=Failed and trigger failover 0 -> 2.
+ runOnExecutor(() -> {
+ stateArray[0] = PulsarServiceState.Failed;
+ counterArray[0].setValue(0);
+ });
+ runCheckCycle();
+ runOnExecutor(() -> {
+ assertEquals(failover.getCurrentPulsarServiceIndex(), 2);
+ assertEquals(stateArray[1], PulsarServiceState.Failed);
+ assertEquals(stateArray[2], PulsarServiceState.Healthy);
+ });
+
+ // url1 becomes healthy; first check cycle moves state[1] Failed ->
PreRecover.
+ setLookupResult(URL1, true);
+ runCheckCycle();
+ runOnExecutor(() -> {
+ assertEquals(failover.getCurrentPulsarServiceIndex(), 2);
+ assertEquals(stateArray[1], PulsarServiceState.PreRecover);
+ assertEquals(stateArray[2], PulsarServiceState.Healthy);
+ });
+
+ // On the cycle that completes recovery (state[1] PreRecover ->
Healthy and triggers
+ // updateServiceUrl(1)), inject a single failed probe at url2 so
state[2] flips
+ // Healthy -> PreFail just before the index transition.
+ setLookupResult(URL2, false);
+ runCheckCycle();
+
+ // After recovery to index 1:
+ // - With the fix: state[2] is reset to Healthy on the transition.
+ // - Without the fix: state[2] is stuck at PreFail forever — the
check loop now
+ // only iterates 0..1 and never visits index 2
again.
+ runOnExecutor(() -> {
+ assertEquals(failover.getCurrentPulsarServiceIndex(), 1);
+ assertEquals(stateArray[1], PulsarServiceState.Healthy);
+ assertEquals(stateArray[2], PulsarServiceState.Healthy,
+ "state[2] should be reset to Healthy on recovery, not
stuck at PreFail");
+ assertEquals(counterArray[2].intValue(), 0,
+ "counter[2] should be reset to 0 on recovery");
+ });
+ }
+
/**
* Verifies that recovery still works correctly for a service that was
marked Failed
* by findFailoverTo, once that service becomes available again.