This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch branch_9_8
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_8 by this push:
new 59aaa8b75e8 SOLR-17652: Fix a bug that could cause long leader
elections to leave PULL replicas in DOWN state forever
59aaa8b75e8 is described below
commit 59aaa8b75e8c7820af50a59adbfafa7398e96d24
Author: Chris Hostetter <[email protected]>
AuthorDate: Wed Feb 5 12:11:30 2025 -0700
SOLR-17652: Fix a bug that could cause long leader elections to leave PULL
replicas in DOWN state forever
(cherry picked from commit bca4cd630b9cff66ecc0431397a99f5289a6462b)
(cherry picked from commit 9f6d7a8274ce344ae17d5405cc174085cf6be430)
---
solr/CHANGES.txt | 2 ++
.../java/org/apache/solr/cloud/ZkController.java | 23 ++++++++++++++--------
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3da8e752454..e0b8509e0d5 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -14,6 +14,8 @@ Bug Fixes
Users can still opt-in by providing a "indexSearcherExecutorThreads" > 0.
(Houston Putman, Varun Thacker, David Smiley, Luke Kot-Zaniewski)
+* SOLR-17652: Fix a bug that could cause long leader elections to leave PULL
replicas in DOWN state forever. (hossman)
+
Dependency Upgrades
---------------------
(No changes)
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 07e7743e457..5f164a522b1 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1332,15 +1332,21 @@ public class ZkController implements Closeable {
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "",
e);
}
- // in this case, we want to wait for the leader as long as the leader
might
- // wait for a vote, at least - but also long enough that a large cluster
has
- // time to get its act together
- String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
+ final String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
- String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
- log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
- boolean isLeader = leaderUrl.equals(ourUrl);
- assert !(isLeader && replica.getType() == Type.PULL) : "Pull replica
became leader!";
+ // Check if we are the (new) leader before deciding if/what type of
recovery to do
+ boolean isLeader = false;
+ if (replica.getType().leaderEligible) {
+ // if we are eligible to be a leader, then we might currently be
participating in leader
+ // election.
+
+ // in this case, we want to wait for the leader as long as the leader
might
+ // wait for a vote, at least - but also long enough that a large
cluster has
+ // time to get its act together
+ String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
+ log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
+ isLeader = leaderUrl.equals(ourUrl);
+ }
try (SolrCore core = cc.getCore(desc.getName())) {
@@ -1384,6 +1390,7 @@ public class ZkController implements Closeable {
}
}
}
+
boolean didRecovery =
checkRecovery(
recoverReloadedCores,