This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new bca4cd630b9 SOLR-17652: Fix a bug that could cause long leader
elections to leave PULL replicas in DOWN state forever
bca4cd630b9 is described below
commit bca4cd630b9cff66ecc0431397a99f5289a6462b
Author: Chris Hostetter <[email protected]>
AuthorDate: Wed Feb 5 12:11:30 2025 -0700
SOLR-17652: Fix a bug that could cause long leader elections to leave PULL
replicas in DOWN state forever
---
solr/CHANGES.txt | 2 ++
.../java/org/apache/solr/cloud/ZkController.java | 24 ++++++++++++++--------
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 524e194260f..3c7a443f790 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -185,6 +185,8 @@ Bug Fixes
* SOLR-17637: LBHttp2SolrClient can fail to complete async requests in certain error scenarios.
This can cause the HttpShardHandler to indefinitely wait on a completed response that will never come. (Houston Putman)
+* SOLR-17652: Fix a bug that could cause long leader elections to leave PULL replicas in DOWN state forever. (hossman)
+
Dependency Upgrades
---------------------
* SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index e81a4a20c2b..07c1b1ad4d7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -1315,16 +1315,21 @@ public class ZkController implements Closeable {
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
}
- // in this case, we want to wait for the leader as long as the leader might
- // wait for a vote, at least - but also long enough that a large cluster has
- // time to get its act together
- String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
+ final String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
- String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
- log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
- boolean isLeader = leaderUrl.equals(ourUrl);
- assert !isLeader || replica.getType().leaderEligible
- : replica.getType().name() + " replica became leader!";
+ // Check if we are the (new) leader before deciding if/what type of recovery to do
+ boolean isLeader = false;
+ if (replica.getType().leaderEligible) {
+ // if we are eligible to be a leader, then we might currently be participating in leader
+ // election.
+
+ // in this case, we want to wait for the leader as long as the leader might
+ // wait for a vote, at least - but also long enough that a large cluster has
+ // time to get its act together
+ String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
+ log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
+ isLeader = leaderUrl.equals(ourUrl);
+ }
try (SolrCore core = cc.getCore(desc.getName())) {
@@ -1368,6 +1373,7 @@ public class ZkController implements Closeable {
}
}
}
+
boolean didRecovery =
checkRecovery(
recoverReloadedCores,