This is an automated email from the ASF dual-hosted git repository.
gerlowskija pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new eb96873c174 SOLR-17692: Abort ongoing fetches on core close (#3292)
eb96873c174 is described below
commit eb96873c174e172a81d0e69cd9d28cc3dc1558c2
Author: Jason Gerlowski <[email protected]>
AuthorDate: Wed Apr 2 08:55:32 2025 -0400
SOLR-17692: Abort ongoing fetches on core close (#3292)
RecoveryStrategy.close aims to stop replication when the surrounding
core is closed, but doesn't quite manage in all cases. In particular,
the 'closed' flag isn't able to preempt replication once the
IndexFetcher has started pulling files.
This commit aims to fix this by having RecoveryStrategy.close invoke
ReplicationHandler.abortFetch, which sets a flag that *is* noticed by
IndexFetcher. This should ensure that DELETEREPLICA calls and other
core-shutdown paths don't block on long-running recovery operations.
---
solr/CHANGES.txt | 3 +++
.../org/apache/solr/cloud/RecoveryStrategy.java | 23 ++++++++++++++++++----
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index c62fc85244d..49b1cabd6c8 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -238,6 +238,9 @@ Bug Fixes
* SOLR-17709: Fix race condition when checking distrib async cmd status
(Houston Putman)
+* SOLR-17692: Core unload/deletion now preempts all forms of ongoing "recovery", rather than inadvertently waiting for
+ completion in some cases. (Jason Gerlowski)
+
Dependency Upgrades
---------------------
* SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 3ea1883af66..164f10d0db0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -106,6 +106,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
Integer.getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 2500);
private int maxRetries = 500;
private int startingRecoveryDelayMilliSeconds = 2000;
+ private ReplicationHandler replicationHandlerDoingFetch;
public static interface RecoveryListener {
public void recovered();
@@ -188,6 +189,15 @@ public class RecoveryStrategy implements Runnable, Closeable {
close = true;
cancelPrepRecoveryCmd();
log.warn("Stopping recovery for core=[{}] coreNodeName=[{}]", coreName, coreZkNodeName);
+ abortIndexFetchingIfNecessary(replicationHandlerDoingFetch);
+ }
+
+ private void abortIndexFetchingIfNecessary(ReplicationHandler fetcher) {
+ // a 'null' ReplicationHandler indicates that no full-recovery/index-fetching is ongoing to
+ // abort.
+ if (fetcher != null) {
+ fetcher.abortFetch();
+ }
}
private final void recoveryFailed(final ZkController zkController, final CoreDescriptor cd)
@@ -240,10 +250,15 @@ public class RecoveryStrategy implements Runnable, Closeable {
ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, replicaType == Replica.Type.TLOG);
if (isClosed()) return; // we check closed on return
- boolean success = replicationHandler.doFetch(solrParams, false).getSuccessful();
-
- if (!success) {
- throw new SolrException(ErrorCode.SERVER_ERROR, "Replication for recovery failed.");
+ try {
+ // Stash the RH so the fetch can be aborted if RecoveryStrategy is closed mid-fetch
+ replicationHandlerDoingFetch = replicationHandler;
+ boolean success = replicationHandler.doFetch(solrParams, false).getSuccessful();
+ if (!success) {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Replication for recovery failed.");
+ }
+ } finally {
+ replicationHandlerDoingFetch = null;
}
// solrcloud_debug