This is an automated email from the ASF dual-hosted git repository.
psalagnac pushed a commit to branch branch_9_8
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_8 by this push:
new 7cf2aa2637c SOLR-17405: allow a single thread to reestablish ZK session (#2914)
7cf2aa2637c is described below
commit 7cf2aa2637c40f07f9c789589b742748ff1685ec
Author: Pierre Salagnac <[email protected]>
AuthorDate: Fri Dec 20 10:27:05 2024 +0100
SOLR-17405: allow a single thread to reestablish ZK session (#2914)
---
solr/CHANGES.txt | 3 +++
.../org/apache/solr/common/cloud/ConnectionManager.java | 2 +-
.../java/org/apache/solr/common/cloud/SolrZkClient.java | 15 ++++++++++++++-
3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 83ba7f4cdc5..ad5c1e07c20 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -121,6 +121,9 @@ Bug Fixes
* SOLR-17306: fix replication problem on follower restart (Martin Anzinger and
Peter Kroiss via Eric Pugh)
+* SOLR-17405: Fix race condition where Zookeeper session could be re-established by multiple threads concurrently in
+ case of frequent session expirations. (Pierre Salagnac)
+
Dependency Upgrades
---------------------
* PR#2702: chore(deps): update io.netty:* to v4.1.114.final (solrbot)
diff --git a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ConnectionManager.java b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ConnectionManager.java
index ba527b1eef8..fd84bb05950 100644
--- a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ConnectionManager.java
+++ b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/ConnectionManager.java
@@ -179,7 +179,7 @@ public class ConnectionManager implements Watcher {
connectionStrategy.reconnect(
zkServerAddress,
client.getZkClientTimeout(),
- this,
+ client.wrapWatcher(this),
new ZkClientConnectionStrategy.ZkUpdate() {
@Override
public void update(ZooKeeper keeper) {
diff --git a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/SolrZkClient.java
index d7a6e5649d7..27d72e825e7 100644
--- a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/SolrZkClient.java
+++ b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/SolrZkClient.java
@@ -218,6 +218,7 @@ public class SolrZkClient implements Closeable {
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
}
+ zkCallbackExecutor.shutdown();
zkConnManagerCallbackExecutor.shutdown();
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
@@ -1077,7 +1078,19 @@ public class SolrZkClient implements Closeable {
public void process(final WatchedEvent event) {
log.debug("Submitting job to respond to event {}", event);
try {
- if (watcher instanceof ConnectionManager) {
+ // We want all the code that re-creates the Zookeeper session and then invokes
+ // ZkController.onReconnect() to never be executed by two threads concurrently.
+ // Pool 'zkConnManagerCallbackExecutor' is single-threaded. We make sure session expiration
+ // events are processed only by this pool. Consequently, in case of a session expiration, we
+ // don't try to re-create a new session until the previous call to onReconnect()
+ // has returned.
+ //
+ // All other events go to pool 'zkCallbackExecutor', which is unbounded and may
+ // spawn as many threads as there are events to process.
+ // This includes events on ConnectionManager other than session expiration. Consequently,
+ // there is no deadlock when the thread currently reestablishing the session waits for
+ // the 'SyncConnected' event.
+ if (watcher instanceof ConnectionManager && event.getState() ==
Event.KeeperState.Expired) {
zkConnManagerCallbackExecutor.execute(() -> watcher.process(event));
} else {
zkCallbackExecutor.execute(