This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new b46bc0cc1c9 SOLR-17667: Simplify zombie logic in LBSolrClient (#3176)
b46bc0cc1c9 is described below

commit b46bc0cc1c937c3e822382c74410b5e83fd9b920
Author: Houston Putman <[email protected]>
AuthorDate: Mon Feb 17 13:54:06 2025 -0600

    SOLR-17667: Simplify zombie logic in LBSolrClient (#3176)
---
 solr/CHANGES.txt                                   |   2 +
 .../solr/client/solrj/impl/LBHttp2SolrClient.java  |  14 +--
 .../solr/client/solrj/impl/LBHttpSolrClient.java   |   2 +-
 .../solr/client/solrj/impl/LBSolrClient.java       | 139 ++++++++++-----------
 .../solr/client/solrj/TestLBHttp2SolrClient.java   |   2 +
 5 files changed, 81 insertions(+), 78 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 29902084134..9c53bd1effe 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -91,6 +91,8 @@ Other Changes
 * SOLR-17671: Replication and backup have their DirectoryFactory.DirContext so 
the directory they use is unwrapped
   when copying files. (Bruno Roustant, David Smiley)
 
+* SOLR-17667: Simplify zombie server logic in LBSolrClient (Houston Putman)
+
 ==================  9.8.0 ==================
 New Features
 ---------------------
diff --git 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
index 14b869fd41a..20b6e3f2e6a 100644
--- 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
+++ 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java
@@ -317,7 +317,7 @@ public class LBHttp2SolrClient extends LBSolrClient {
       RetryListener listener) {
     rsp.rsp = result;
     if (isZombie) {
-      zombieServers.remove(endpoint);
+      reviveZombieServer(endpoint);
     }
     listener.onSuccess(rsp);
   }
@@ -336,32 +336,32 @@ public class LBHttp2SolrClient extends LBSolrClient {
       // we retry on 404 or 403 or 503 or 500
       // unless it's an update - then we only retry on connect exception
       if (!isNonRetryable && RETRY_CODES.contains(e.code())) {
-        listener.onFailure((!isZombie) ? addZombie(endpoint, e) : e, true);
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else {
         // Server is alive but the request was likely malformed or invalid
         if (isZombie) {
-          zombieServers.remove(endpoint);
+          reviveZombieServer(endpoint);
         }
         listener.onFailure(e, false);
       }
     } catch (SocketException e) {
       if (!isNonRetryable || e instanceof ConnectException) {
-        listener.onFailure((!isZombie) ? addZombie(endpoint, e) : e, true);
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else {
         listener.onFailure(e, false);
       }
     } catch (SocketTimeoutException e) {
       if (!isNonRetryable) {
-        listener.onFailure((!isZombie) ? addZombie(endpoint, e) : e, true);
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else {
         listener.onFailure(e, false);
       }
     } catch (SolrServerException e) {
       Throwable rootCause = e.getRootCause();
       if (!isNonRetryable && rootCause instanceof IOException) {
-        listener.onFailure((!isZombie) ? addZombie(endpoint, e) : e, true);
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else if (isNonRetryable && rootCause instanceof ConnectException) {
-        listener.onFailure((!isZombie) ? addZombie(endpoint, e) : e, true);
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else {
         listener.onFailure(e, false);
       }
diff --git 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
index cfaf35e86f7..6486516d109 100644
--- 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
+++ 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttpSolrClient.java
@@ -192,7 +192,7 @@ public class LBHttpSolrClient extends LBSolrClient {
    */
   @Deprecated
   @Override
-  public String removeSolrServer(String server) {
+  public synchronized String removeSolrServer(String server) {
     urlToClient.remove(server);
     return super.removeSolrServer(server);
   }
diff --git 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
index 0b4f3a1523a..d81ee864e31 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
@@ -71,7 +71,7 @@ public abstract class LBSolrClient extends SolrClient {
 
   public static final String UPDATE_LIVE_SERVER_MESSAGE = "Updated alive 
server list";
 
-  private static final String UPDATE_LIVE_SERVER_LOG = 
UPDATE_LIVE_SERVER_MESSAGE + ": {}";
+  private static final String UPDATE_LIVE_SERVER_LOG = 
UPDATE_LIVE_SERVER_MESSAGE + " for {}: {}";
 
   // defaults
   protected static final Set<Integer> RETRY_CODES =
@@ -81,8 +81,7 @@ public abstract class LBSolrClient extends SolrClient {
 
   // keys to the maps are currently of the form "http://localhost:8983/solr";
   // which should be equivalent to HttpSolrServer.getBaseURL()
-  private final Map<String, ServerWrapper> aliveServers = new 
LinkedHashMap<>();
-  // access to aliveServers should be synchronized on itself
+  private final Map<String, ServerWrapper> allServers = new LinkedHashMap<>();
 
   protected final Map<String, ServerWrapper> zombieServers = new 
ConcurrentHashMap<>();
 
@@ -192,11 +191,6 @@ public abstract class LBSolrClient extends SolrClient {
   protected static class ServerWrapper {
     final String baseUrl;
 
-    // "standard" servers are used by default.  They normally live in the 
alive list
-    // and move to the zombie list when unavailable.  When they become 
available again,
-    // they move back to the alive list.
-    boolean standard = true;
-
     int failedPings = 0;
 
     ServerWrapper(String baseUrl) {
@@ -423,7 +417,7 @@ public abstract class LBSolrClient extends SolrClient {
     if (!baseSolrUrls.isEmpty()) {
       for (String s : baseSolrUrls) {
         ServerWrapper wrapper = createServerWrapper(s);
-        aliveServers.put(wrapper.getBaseUrl(), wrapper);
+        allServers.put(wrapper.getBaseUrl(), wrapper);
       }
       updateAliveList();
     }
@@ -431,15 +425,21 @@ public abstract class LBSolrClient extends SolrClient {
   }
 
   protected void updateAliveList() {
-    synchronized (aliveServers) {
-      aliveServerList = aliveServers.values().toArray(new ServerWrapper[0]);
+    synchronized (allServers) {
+      aliveServerList =
+          allServers.values().stream()
+              .filter(server -> !zombieServers.containsKey(server.baseUrl))
+              .toArray(ServerWrapper[]::new);
       if (log.isDebugEnabled()) {
-        log.debug(UPDATE_LIVE_SERVER_LOG, Arrays.toString(aliveServerList));
+        log.debug(UPDATE_LIVE_SERVER_LOG, this, 
Arrays.toString(aliveServerList));
       }
     }
   }
 
   protected ServerWrapper createServerWrapper(String baseUrl) {
+    if (allServers.containsKey(baseUrl)) {
+      return allServers.get(baseUrl);
+    }
     return new ServerWrapper(baseUrl);
   }
 
@@ -512,8 +512,7 @@ public abstract class LBSolrClient extends SolrClient {
       rsp.server = baseUrl;
       rsp.rsp = doRequest(baseUrl, req.getRequest(), null);
       if (isZombie) {
-        // TODO: zombieServers key is String not Endpoint.
-        zombieServers.remove(baseUrl);
+        reviveZombieServer(baseUrl);
       }
     } catch (BaseHttpSolrClient.RemoteExecutionException e) {
       throw e;
@@ -521,32 +520,32 @@ public abstract class LBSolrClient extends SolrClient {
       // we retry on 404 or 403 or 503 or 500
       // unless it's an update - then we only retry on connect exception
       if (!isNonRetryable && RETRY_CODES.contains(e.code())) {
-        ex = (!isZombie) ? addZombie(baseUrl, e) : e;
+        ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else {
         // Server is alive but the request was likely malformed or invalid
         if (isZombie) {
-          zombieServers.remove(baseUrl);
+          reviveZombieServer(baseUrl);
         }
         throw e;
       }
     } catch (SocketException e) {
       if (!isNonRetryable || e instanceof ConnectException) {
-        ex = (!isZombie) ? addZombie(baseUrl, e) : e;
+        ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else {
         throw e;
       }
     } catch (SocketTimeoutException e) {
       if (!isNonRetryable) {
-        ex = (!isZombie) ? addZombie(baseUrl, e) : e;
+        ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else {
         throw e;
       }
     } catch (SolrServerException e) {
       Throwable rootCause = e.getRootCause();
       if (!isNonRetryable && rootCause instanceof IOException) {
-        ex = (!isZombie) ? addZombie(baseUrl, e) : e;
+        ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else if (isNonRetryable && rootCause instanceof ConnectException) {
-        ex = (!isZombie) ? addZombie(baseUrl, e) : e;
+        ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else {
         throw e;
       }
@@ -580,14 +579,6 @@ public abstract class LBSolrClient extends SolrClient {
 
   protected abstract SolrClient getClient(Endpoint endpoint);
 
-  protected Exception addZombie(String serverStr, Exception e) {
-    ServerWrapper wrapper = createServerWrapper(serverStr);
-    wrapper.standard = false;
-    zombieServers.put(serverStr, wrapper);
-    startAliveCheckExecutor();
-    return e;
-  }
-
   /**
    * LBHttpSolrServer keeps pinging the dead servers at fixed interval to find 
if it is alive. Use
    * this to set that interval
@@ -612,6 +603,7 @@ public abstract class LBSolrClient extends SolrClient {
     if (aliveCheckExecutor == null) {
       synchronized (this) {
         if (aliveCheckExecutor == null) {
+          log.debug("Starting aliveCheckExecutor for {}", this);
           aliveCheckExecutor =
               Executors.newSingleThreadScheduledExecutor(
                   new SolrNamedThreadFactory("aliveCheckExecutor"));
@@ -670,25 +662,14 @@ public abstract class LBSolrClient extends SolrClient {
 
   private void checkAZombieServer(ServerWrapper zombieServer) {
     try {
+      log.debug("Checking zombie server {} for {}", zombieServer, this);
       QueryRequest queryRequest = new QueryRequest(solrQuery);
       final var responseRaw = doRequest(zombieServer.getBaseUrl(), 
queryRequest, null);
       final var resp = new QueryResponse();
       resp.setResponse(responseRaw);
       if (resp.getStatus() == 0) {
         // server has come back up.
-        // make sure to remove from zombies before adding to the alive list to 
avoid a race
-        // condition
-        // where another thread could mark it down, move it back to zombie, 
and then we delete
-        // from zombie and lose it forever.
-        ServerWrapper wrapper = 
zombieServers.remove(zombieServer.getBaseUrl());
-        if (wrapper != null) {
-          wrapper.failedPings = 0;
-          if (wrapper.standard) {
-            addToAlive(wrapper);
-          }
-        } else {
-          // something else already moved the server from zombie to alive
-        }
+        reviveZombieServer(zombieServer);
       }
     } catch (Exception e) {
       // Expected. The server is still down.
@@ -696,34 +677,49 @@ public abstract class LBSolrClient extends SolrClient {
 
       // If the server doesn't belong in the standard set belonging to this 
load balancer
       // then simply drop it after a certain number of failed pings.
-      if (!zombieServer.standard && zombieServer.failedPings >= 
NONSTANDARD_PING_LIMIT) {
-        zombieServers.remove(zombieServer.getBaseUrl());
+      if (!allServers.containsKey(zombieServer.getBaseUrl())
+          && zombieServer.failedPings >= NONSTANDARD_PING_LIMIT) {
+        reviveZombieServer(zombieServer);
       }
     }
   }
 
-  private ServerWrapper removeFromAlive(String key) {
-    synchronized (aliveServers) {
-      ServerWrapper wrapper = aliveServers.remove(key);
-      if (wrapper != null) updateAliveList();
-      return wrapper;
+  protected synchronized void reviveZombieServer(String endpoint) {
+    ServerWrapper server = zombieServers.remove(endpoint);
+    if (server != null && allServers.containsKey(endpoint)) {
+      server.failedPings = 0;
+      updateAliveList();
     }
   }
 
-  private void addToAlive(ServerWrapper wrapper) {
-    synchronized (aliveServers) {
-      ServerWrapper prev = aliveServers.put(wrapper.getBaseUrl(), wrapper);
-      // TODO: warn if there was a previous entry?
+  protected synchronized void reviveZombieServer(ServerWrapper wrapper) {
+    reviveZombieServer(wrapper.getBaseUrl());
+  }
+
+  protected synchronized void makeServerAZombie(ServerWrapper wrapper) {
+    if (zombieServers.putIfAbsent(wrapper.getBaseUrl(), wrapper) == null) {
       updateAliveList();
     }
+    startAliveCheckExecutor();
+  }
+
+  protected void makeServerAZombie(String endpoint) {
+    makeServerAZombie(createServerWrapper(endpoint));
+  }
+
+  protected Exception makeServerAZombie(String endpoint, Exception e) {
+    makeServerAZombie(endpoint);
+    return e;
   }
 
   /**
    * @deprecated use {@link #addSolrServer(Endpoint)} instead
    */
   @Deprecated
-  public void addSolrServer(String server) throws MalformedURLException {
-    addToAlive(createServerWrapper(server));
+  public synchronized void addSolrServer(String server) throws 
MalformedURLException {
+    ServerWrapper prev = allServers.put(server, createServerWrapper(server));
+    // TODO: warn if there was a previous entry?
+    updateAliveList();
   }
 
   public void addSolrServer(Endpoint server) throws MalformedURLException {
@@ -734,7 +730,7 @@ public abstract class LBSolrClient extends SolrClient {
    * @deprecated use {@link #removeSolrEndpoint(Endpoint)} instead
    */
   @Deprecated
-  public String removeSolrServer(String server) {
+  public synchronized String removeSolrServer(String server) {
     server = URI.create(server).toString();
     if (server.endsWith("/")) {
       server = server.substring(0, server.length() - 1);
@@ -742,8 +738,9 @@ public abstract class LBSolrClient extends SolrClient {
 
     // there is a small race condition here - if the server is in the process 
of being moved between
     // lists, we could fail to remove it.
-    removeFromAlive(server);
+    allServers.remove(server);
     zombieServers.remove(server);
+    updateAliveList();
     return null;
   }
 
@@ -798,12 +795,21 @@ public abstract class LBSolrClient extends SolrClient {
       } catch (SolrServerException e) {
         if (e.getRootCause() instanceof IOException) {
           ex = e;
-          moveAliveToDead(wrapper);
-          if (justFailed == null) justFailed = new HashMap<>();
+          makeServerAZombie(wrapper);
+          if (justFailed == null) {
+            justFailed = new HashMap<>();
+          }
           justFailed.put(wrapper.getBaseUrl(), wrapper);
         } else {
           throw e;
         }
+      } catch (IOException e) {
+        ex = e;
+        makeServerAZombie(wrapper);
+        if (justFailed == null) {
+          justFailed = new HashMap<>();
+        }
+        justFailed.put(wrapper.getBaseUrl(), wrapper);
       } catch (Exception e) {
         throw new SolrServerException(e);
       }
@@ -816,18 +822,18 @@ public abstract class LBSolrClient extends SolrClient {
         break;
       }
 
-      if (wrapper.standard == false
+      // Do not try this server if it was just tried, or if it is not in this 
client's list of
+      // servers
+      if (!allServers.containsKey(wrapper.getBaseUrl())
           || (justFailed != null && 
justFailed.containsKey(wrapper.getBaseUrl()))) continue;
       try {
         ++numServersTried;
         final var rsp = doRequest(wrapper.baseUrl, request, collection);
-        // remove from zombie list *before* adding the alive list to avoid a 
race that could lose a
-        // server
-        zombieServers.remove(wrapper.getBaseUrl());
-        addToAlive(wrapper);
+        reviveZombieServer(wrapper);
         return rsp;
       } catch (SolrException e) {
         // Server is alive but the request was malformed or invalid
+        reviveZombieServer(wrapper);
         throw e;
       } catch (SolrServerException e) {
         if (e.getRootCause() instanceof IOException) {
@@ -876,13 +882,6 @@ public abstract class LBSolrClient extends SolrClient {
     return aliveServerList[count % aliveServerList.length];
   }
 
-  private void moveAliveToDead(ServerWrapper wrapper) {
-    wrapper = removeFromAlive(wrapper.getBaseUrl());
-    if (wrapper == null) return; // another thread already detected the 
failure and removed it
-    zombieServers.put(wrapper.getBaseUrl(), wrapper);
-    startAliveCheckExecutor();
-  }
-
   @Override
   public void close() {
     synchronized (this) {
diff --git 
a/solr/solrj/src/test/org/apache/solr/client/solrj/TestLBHttp2SolrClient.java 
b/solr/solrj/src/test/org/apache/solr/client/solrj/TestLBHttp2SolrClient.java
index 3ab44ca13d4..590255a6dd6 100644
--- 
a/solr/solrj/src/test/org/apache/solr/client/solrj/TestLBHttp2SolrClient.java
+++ 
b/solr/solrj/src/test/org/apache/solr/client/solrj/TestLBHttp2SolrClient.java
@@ -37,6 +37,7 @@ import org.apache.solr.client.solrj.response.SolrResponseBase;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.embedded.JettyConfig;
 import org.apache.solr.embedded.JettySolrRunner;
+import org.apache.solr.util.LogLevel;
 import org.apache.solr.util.LogListener;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -48,6 +49,7 @@ import org.slf4j.LoggerFactory;
  *
  * @since solr 1.4
  */
+@LogLevel("org.apache.solr.client.solrj.impl=DEBUG")
 public class TestLBHttp2SolrClient extends SolrTestCaseJ4 {
 
   private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

Reply via email to