This is an automated email from the ASF dual-hosted git repository.
dsmiley pushed a commit to branch branch_10x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_10x by this push:
new b6983892ba3 SOLR-18002: idle timeouts should cause servers to be added
to the zombie list (#3891)
b6983892ba3 is described below
commit b6983892ba3b6d129a766f07b5377c24b523de56
Author: jvanneman <[email protected]>
AuthorDate: Sun Jan 25 10:05:32 2026 -0500
SOLR-18002: idle timeouts should cause servers to be added to the zombie
list (#3891)
CloudSolrClient/LBSolrClient should consider a retry-able request that
times out as another condition to internally mark that replica as a "zombie".
Previously, unresponsive servers continued to receive traffic, causing high
client latencies because the idle timeout was consistently triggered on every
request to that replica.
---
...002-add-unresponsive-servers-to-zombie-list.yml | 8 +
.../pages/configuring-solr-xml.adoc | 5 +-
.../query-guide/pages/common-query-parameters.adoc | 3 +-
.../solr/client/solrj/impl/LBAsyncSolrClient.java | 16 +-
.../solr/client/solrj/impl/LBSolrClient.java | 16 +-
.../solr/client/solrj/impl/LB2SolrClientTest.java | 188 +++++++++++++++++++++
6 files changed, 230 insertions(+), 6 deletions(-)
diff --git
a/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml
b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml
new file mode 100644
index 00000000000..b658a667c37
--- /dev/null
+++
b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml
@@ -0,0 +1,8 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: CloudSolrClient/LBSolrClient should consider a retry-able request that
times out as another condition to internally mark that replica as a "zombie".
+type: changed # added, changed, fixed, deprecated, removed, dependency_update,
security, other
+authors:
+ - name: James Vanneman
+links:
+ - name: SOLR-18002
+ url: https://issues.apache.org/jira/browse/SOLR-18002
diff --git
a/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
b/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
index a1032f5ecf7..2aed51b89e3 100644
---
a/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
+++
b/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
@@ -522,7 +522,7 @@ Custom shard handlers are also supported and should be
referenced in `solr.xml`
Sub-elements of `<shardHandlerFactory>` may vary in the case of custom shard
handlers, but both `HttpShardHandlerFactory` and `ParallelShardHandlerFactory`
support the following configuration options:
-`socketTimeout`::
+[[sockettimeout]]`socketTimeout`::
+
[%autowidth,frame=none]
|===
@@ -531,6 +531,9 @@ Sub-elements of `<shardHandlerFactory>` may vary in the
case of custom shard han
+
The read timeout for intra-cluster query and administrative requests.
The default is the same as the `distribUpdateSoTimeout` specified in the
`<solrcloud>` section.
++
+It is recommended to set this value larger than any
xref:query-guide:common-query-parameters.adoc#timeallowed-parameter[`timeAllowed`]
query parameter in use, so that
xref:query-guide:common-query-parameters.adoc#timeallowed-parameter[`timeAllowed`]
can gracefully finish the request processing and return partial results before
the coordinator gives up on the request.
+
`connTimeout`::
+
diff --git
a/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
b/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
index 49c418fd749..4e69ef4855f 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
@@ -350,7 +350,8 @@ This parameter specifies the amount of time, in
milliseconds, allowed for a sear
If this time expires before the search is complete, any partial results will
be returned, but values such as `numFound`, xref:faceting.adoc[facet] counts,
and result xref:stats-component.adoc[stats] may not be accurate for the entire
result set.
In case of expiration, if `omitHeader` isn't set to `true` the response header
contains a special flag called `partialResults`.
When using `timeAllowed` in combination with
xref:pagination-of-results.adoc#using-cursors[`cursorMark`], and the
`partialResults` flag is present, some matching documents may have been skipped
in the result set.
-Additionally, if the `partialResults` flag is present, `cursorMark` can match
`nextCursorMark` even if there may be more results
+Additionally, if the `partialResults` flag is present, `cursorMark` can match
`nextCursorMark` even if there may be more results.
+It is recommended to set this value smaller than the
xref:configuration-guide:configuring-solr-xml.adoc#sockettimeout[`socketTimeout`]
configured in `solr.xml`, so that Solr can gracefully finish the request
processing and return partial results before the coordinator gives up on the
request.
[source,json]
----
diff --git
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
index c02197c2d4d..48dab986d80 100644
---
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
+++
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
@@ -21,6 +21,8 @@ import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.solr.client.solrj.RemoteSolrException;
import org.apache.solr.client.solrj.SolrClient;
@@ -179,6 +181,9 @@ public abstract class LBAsyncSolrClient extends
LBSolrClient {
boolean isNonRetryable,
boolean isZombie,
RetryListener listener) {
+ if (oe instanceof CompletionException) {
+ oe = oe.getCause();
+ }
try {
throw (Exception) oe;
} catch (SolrException e) {
@@ -210,9 +215,16 @@ public abstract class LBAsyncSolrClient extends
LBSolrClient {
}
} catch (SolrServerException e) {
Throwable rootCause = e.getRootCause();
- if (!isNonRetryable && rootCause instanceof IOException) {
+ if (!isNonRetryable
+ && (rootCause instanceof IOException || rootCause instanceof
TimeoutException)) {
+ listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e,
true);
+ } else if (isNonRetryable && isConnectException(rootCause)) {
listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e,
true);
- } else if (isNonRetryable && rootCause instanceof ConnectException) {
+ } else {
+ listener.onFailure(e, false);
+ }
+ } catch (IOException e) {
+ if (!isNonRetryable || isConnectException(e)) {
listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e,
true);
} else {
listener.onFailure(e, false);
diff --git
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
index fc0d06c7f29..74e55e9629d 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
@@ -23,6 +23,7 @@ import java.lang.ref.WeakReference;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
+import java.net.http.HttpConnectTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -38,6 +39,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.solr.client.solrj.RemoteSolrException;
@@ -665,9 +667,10 @@ public abstract class LBSolrClient extends SolrClient {
}
} catch (SolrServerException e) {
Throwable rootCause = e.getRootCause();
- if (!isNonRetryable && rootCause instanceof IOException) {
+ if (!isNonRetryable
+ && (rootCause instanceof IOException || rootCause instanceof
TimeoutException)) {
ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
- } else if (isNonRetryable && rootCause instanceof ConnectException) {
+ } else if (isNonRetryable && isConnectException(rootCause)) {
ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
} else {
throw e;
@@ -679,6 +682,15 @@ public abstract class LBSolrClient extends SolrClient {
return ex;
}
+ protected boolean isConnectException(Throwable t) {
+ if (t instanceof ConnectException || t instanceof
HttpConnectTimeoutException) {
+ return true;
+ }
+ // Check for common connection timeout exceptions by name to avoid hard
dependencies on
+ // specific HTTP client libraries (e.g., Jetty or Apache HttpClient).
+ return t != null &&
t.getClass().getName().endsWith("ConnectTimeoutException");
+ }
+
protected abstract SolrClient getClient(Endpoint endpoint);
private void startAliveCheckExecutor() {
diff --git
a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
index 79c24dfd62d..66e806e9538 100644
---
a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
+++
b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
@@ -18,6 +18,8 @@ package org.apache.solr.client.solrj.impl;
import java.io.IOException;
import java.io.UncheckedIOException;
+import java.net.ServerSocket;
+import java.net.Socket;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
@@ -25,16 +27,22 @@ import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
+import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.jetty.HttpJettySolrClient;
+import org.apache.solr.client.solrj.jetty.LBJettySolrClient;
+import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.SolrQuery;
+import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.SolrResponseBase;
import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.RetryUtil;
import org.apache.solr.embedded.JettyConfig;
import org.apache.solr.embedded.JettySolrRunner;
@@ -204,6 +212,62 @@ public class LB2SolrClientTest extends SolrTestCaseJ4 {
}
}
+ public void testTimeoutExceptionMarksServerAsZombie() throws Exception {
+ try (TimeoutZombieTestContext ctx = new TimeoutZombieTestContext()) {
+ LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+ try {
+ ctx.lbClient.request(lbReq);
+ } catch (Exception e) {
+ }
+
+ ctx.assertZombieState();
+ }
+ }
+
+ public void testTimeoutExceptionMarksServerAsZombieAsyncRequest() throws
Exception {
+ try (TimeoutZombieTestContext ctx = new TimeoutZombieTestContext()) {
+ LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+ ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+ ctx.assertZombieState();
+ }
+ }
+
+ public void testConnectTimeoutExceptionMarksServerAsZombie() throws
Exception {
+ try (ConnectTimeoutZombieTestContext ctx = new
ConnectTimeoutZombieTestContext()) {
+ LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+ try {
+ ctx.lbClient.request(lbReq);
+ } catch (Exception e) {
+ }
+
+ ctx.assertZombieState();
+ }
+ }
+
+ public void testConnectTimeoutExceptionMarksServerAsZombieAsyncRequest()
throws Exception {
+ try (ConnectTimeoutZombieTestContext ctx = new
ConnectTimeoutZombieTestContext()) {
+ LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+ ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+ ctx.assertZombieState();
+ }
+ }
+
+ public void testConnectTimeoutExceptionMarksServerAsZombieAsyncUpdate()
throws Exception {
+ try (ConnectTimeoutZombieTestContext ctx = new
ConnectTimeoutZombieTestContext()) {
+ LBSolrClient.Req lbReq = ctx.createUpdateRequest();
+
+ ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+ ctx.assertZombieState();
+ }
+ }
+
private LBSolrClient.Endpoint[] bootstrapBaseSolrEndpoints(int max) {
LBSolrClient.Endpoint[] solrUrls = new LBSolrClient.Endpoint[max];
for (int i = 0; i < max; i++) {
@@ -318,4 +382,128 @@ public class LB2SolrClientTest extends SolrTestCaseJ4 {
}
}
}
+
+ private class TimeoutZombieTestContext implements AutoCloseable {
+ final ServerSocket blackhole;
+ final LBSolrClient.Endpoint nonRoutableEndpoint;
+ final HttpJettySolrClient delegateClient;
+ final LBAsyncSolrClient lbClient;
+
+ TimeoutZombieTestContext() throws Exception {
+ // create a socket that allows a client to connect but causes them to
hang until idleTimeout
+ // is triggered
+ blackhole = new ServerSocket(0);
+ int blackholePort = blackhole.getLocalPort();
+ nonRoutableEndpoint =
+ new LBSolrClient.Endpoint("http://localhost:" + blackholePort +
"/solr");
+
+ delegateClient =
+ new HttpJettySolrClient.Builder()
+ .withConnectionTimeout(1000, TimeUnit.MILLISECONDS)
+ .withIdleTimeout(1, TimeUnit.MILLISECONDS)
+ .build();
+
+ lbClient = new LBJettySolrClient.Builder(delegateClient,
nonRoutableEndpoint).build();
+ }
+
+ LBSolrClient.Req createQueryRequest() {
+ SolrQuery solrQuery = new SolrQuery("*:*");
+ QueryRequest queryRequest = new QueryRequest(solrQuery);
+
+ List<LBSolrClient.Endpoint> endpoints =
+ List.of(
+ new LBSolrClient.Endpoint(
+ nonRoutableEndpoint.getBaseUrl(),
solr[0].getDefaultCollection()));
+ return new LBSolrClient.Req(queryRequest, endpoints);
+ }
+
+ void assertZombieState() {
+ assertTrue(
+ "Non-routable endpoint should be marked as zombie due to timeout",
+ lbClient.zombieServers.containsKey(
+ nonRoutableEndpoint.getBaseUrl() + "/" +
solr[0].getDefaultCollection()));
+ }
+
+ @Override
+ public void close() {
+ lbClient.close();
+ delegateClient.close();
+ try {
+ blackhole.close();
+ } catch (IOException ioe) {
+
+ }
+ }
+ }
+
+ private class ConnectTimeoutZombieTestContext implements AutoCloseable {
+ final ServerSocket ss;
+ final Socket connector;
+ final LBSolrClient.Endpoint nonRoutableEndpoint;
+ final LBAsyncSolrClient lbClient;
+ final HttpJdkSolrClient delegateClient;
+
+ ConnectTimeoutZombieTestContext() throws Exception {
+ // Create a server socket with a backlog of 1 and occupy that slot to
trigger a connect
+ // timeout.
+ ss = new ServerSocket(0, 1);
+ int port = ss.getLocalPort();
+ connector = new Socket("127.0.0.1", port);
+
+ nonRoutableEndpoint = new LBSolrClient.Endpoint("http://127.0.0.1:" +
port + "/solr");
+ delegateClient =
+ new HttpJdkSolrClient.Builder(nonRoutableEndpoint.getBaseUrl())
+ .withConnectionTimeout(1, TimeUnit.MILLISECONDS)
+ .build();
+
+ lbClient =
+ new LBAsyncSolrClient(
+ new LBSolrClient.Builder<>(delegateClient, nonRoutableEndpoint)
+ .withDefaultCollection(solr[0].getDefaultCollection())) {
+ @Override
+ protected CompletableFuture<NamedList<Object>> requestAsyncWithUrl(
+ SolrClient client, String baseUrl, SolrRequest<?> request)
+ throws SolrServerException, IOException {
+ return ((HttpJdkSolrClient) client).requestAsync(request, null);
+ }
+ };
+ }
+
+ LBSolrClient.Req createQueryRequest() {
+ SolrQuery solrQuery = new SolrQuery("*:*");
+ QueryRequest queryRequest = new QueryRequest(solrQuery);
+
+ List<LBSolrClient.Endpoint> endpoints =
+ List.of(
+ new LBSolrClient.Endpoint(
+ nonRoutableEndpoint.getBaseUrl(),
solr[0].getDefaultCollection()));
+ return new LBSolrClient.Req(queryRequest, endpoints);
+ }
+
+ LBSolrClient.Req createUpdateRequest() {
+ UpdateRequest updateRequest = new UpdateRequest();
+ updateRequest.add(new SolrInputDocument());
+
+ List<LBSolrClient.Endpoint> endpoints =
+ List.of(
+ new LBSolrClient.Endpoint(
+ nonRoutableEndpoint.getBaseUrl(),
solr[0].getDefaultCollection()));
+ return new LBSolrClient.Req(updateRequest, endpoints);
+ }
+
+ void assertZombieState() {
+ assertTrue(
+ "Endpoint should be marked as zombie due to connect timeout",
+ lbClient.zombieServers.containsKey(
+ nonRoutableEndpoint.getBaseUrl() + "/" +
solr[0].getDefaultCollection()));
+ }
+
+ @Override
+ public void close() throws IOException {
+ lbClient.close();
+ delegateClient.close();
+ connector.close();
+ ss.close();
+ }
+ }
}