This is an automated email from the ASF dual-hosted git repository.
houston pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new e51dd47d884 SOLR-17709: Fix race condition when checking distrib async
cmd status (#3268)
e51dd47d884 is described below
commit e51dd47d88445259d57fc63dc655aecaafecf265
Author: Houston Putman <[email protected]>
AuthorDate: Wed Mar 19 14:16:32 2025 -0500
SOLR-17709: Fix race condition when checking distrib async cmd status
(#3268)
(cherry picked from commit d0d4f280b6410d8996fa998620d9b6661848d1f0)
---
solr/CHANGES.txt | 2 ++
.../solr/cloud/DistributedApiAsyncTracker.java | 28 +++++++++++++++-------
2 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 942196f4982..239325e1a96 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -59,6 +59,8 @@ Bug Fixes
* SOLR-17699: Fixed performance regression since 9.0 with "frange" queries
embedded in other queries.
May also affect some numeric range queries when the minimum is negative.
(David Smiley)
+* SOLR-17709: Fix race condition when checking distrib async cmd status
(Houston Putman)
+
Dependency Upgrades
---------------------
* SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git
a/solr/core/src/java/org/apache/solr/cloud/DistributedApiAsyncTracker.java
b/solr/core/src/java/org/apache/solr/cloud/DistributedApiAsyncTracker.java
index 8ebdb2ae287..c41244aace9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DistributedApiAsyncTracker.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DistributedApiAsyncTracker.java
@@ -221,6 +221,26 @@ public class DistributedApiAsyncTracker {
return new Pair<>(RequestStatusState.NOT_FOUND, null);
}
+ /*
+ * First check the inFlightAsyncTasks, then the trackedAsyncTasks.
+ * This is the reverse of "cancelAsyncId()" and "setTaskCompleted()",
+ * because a race condition can occur where this is called between the
+ * "trackedAsyncTasks" and "inFlightAsyncTasks" modifications in those
methods.
+ *
+ * Therefore, here we will start with inFlightAsyncTasks, and if it hasn't
been updated yet,
+ * we return the "old-ish" state, and the client can try again. If the
inFlightAsyncTasks has
+ * been updated (removed), then we know to try the trackedAsyncTasks which
must have been updated
+ * at that point.
+ */
+
+ // Now dealing with the middle column "persistent=null or failed
OverseerSolrResponse"
+ InFlightJobs.State ephemeralState =
inFlightAsyncTasks.getInFlightState(asyncId);
+ if (ephemeralState == InFlightJobs.State.SUBMITTED) {
+ return new Pair<>(RequestStatusState.SUBMITTED, null);
+ } else if (ephemeralState == InFlightJobs.State.RUNNING) {
+ return new Pair<>(RequestStatusState.RUNNING, null);
+ }
+
byte[] data = trackedAsyncTasks.get(asyncId);
OverseerSolrResponse response =
data != null ? OverseerSolrResponseSerializer.deserialize(data) : null;
@@ -233,14 +253,6 @@ public class DistributedApiAsyncTracker {
return new Pair<>(RequestStatusState.COMPLETED, response);
}
- // Now dealing with the middle column "persistent=null or failed
OverseerSolrResponse"
- InFlightJobs.State ephemeralState =
inFlightAsyncTasks.getInFlightState(asyncId);
- if (ephemeralState == InFlightJobs.State.SUBMITTED) {
- return new Pair<>(RequestStatusState.SUBMITTED, null);
- } else if (ephemeralState == InFlightJobs.State.RUNNING) {
- return new Pair<>(RequestStatusState.RUNNING, null);
- }
-
// The task has failed, but there are two options: if response is null, it
has failed because
// the node on which it was running has crashed. If it is not null, it has
failed because the
// execution has failed. Because caller expects a non null response in any
case, let's make up