This is an automated email from the ASF dual-hosted git repository.
DaanHoogland pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudstack.git
The following commit(s) were added to refs/heads/main by this push:
new 850b44317a4 fix(linstor): verify resource deletion completes; warn if
stuck in DELETING (#13076)
850b44317a4 is described below
commit 850b44317a4c7e4f57b9d03f6890fd143bbd63a8
Author: James Peru Mmbono <[email protected]>
AuthorDate: Thu May 21 17:57:33 2026 +0300
fix(linstor): verify resource deletion completes; warn if stuck in DELETING
(#13076)
Co-authored-by: jmsperu <[email protected]>
---
.../kvm/storage/LinstorStorageAdaptor.java | 11 +++++
.../driver/LinstorPrimaryDataStoreDriverImpl.java | 14 ++++++
.../storage/datastore/util/LinstorUtil.java | 52 ++++++++++++++++++++++
3 files changed, 77 insertions(+)
diff --git
a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
index 77953a32e63..31a41cd9407 100644
---
a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
+++
b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
@@ -514,6 +514,17 @@ public class LinstorStorageAdaptor implements
StorageAdaptor {
ApiCallRcList answers =
api.resourceDefinitionDelete(rd.getName());
checkLinstorAnswersThrow(answers);
deleted = true;
+
+ // LINSTOR can return success here while the resource lingers
in DELETING state
+ // on the controller (down peer, lost quorum, etc.). Confirm
it's actually gone
+ // — if not, log a WARN so operators can clear it manually.
Don't throw: the
+ // CloudStack-side accounting has already moved on.
+ if (!LinstorUtil.waitForResourceDefinitionDeleted(api,
rd.getName(),
+ LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
+ logger.warn("Linstor: resource {} still present {}ms after
delete returned success — " +
+ "may be stuck in DELETING. Check the LINSTOR
controller (linstor resource list).",
+ rd.getName(),
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
+ }
}
}
return deleted;
diff --git
a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
index 83dacf74e8d..d7451fab18f 100644
---
a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
+++
b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
@@ -232,6 +232,20 @@ public class LinstorPrimaryDataStoreDriverImpl implements
PrimaryDataStoreDriver
throw new CloudRuntimeException("Linstor: Unable to delete
resource definition: " + rscDefName);
}
logger.info("Linstor: Deleted resource {}", rscDefName);
+
+ // LINSTOR can return success on the delete API call while the
resource lingers in
+ // DELETING state (peer issues, lost quorum, satellite down).
Verify the resource is
+ // actually gone — if not, log a WARN so operators see it. We
deliberately do NOT
+ // throw here: the volume is already considered gone on the
CloudStack side, and
+ // throwing would leave the CS DB and LINSTOR in different states.
+ if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi,
rscDefName,
+ LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
+ {
+ logger.warn("Linstor: resource {} still present {}ms after
delete returned success — " +
+ "may be stuck in DELETING. Check the LINSTOR
controller (linstor resource list) " +
+ "and clear manually if the resource has no live
peers.",
+ rscDefName,
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
+ }
} catch (ApiException apiEx)
{
logger.error("Linstor: ApiEx - " + apiEx.getMessage());
diff --git
a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
index 7c45493dddc..239331077e1 100644
---
a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
+++
b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
@@ -401,6 +401,58 @@ public class LinstorUtil {
.collect(Collectors.toList());
}
+ /**
+ * Default per-call timeout for {@link #waitForResourceDefinitionDeleted}.
Long enough for a
+ * healthy LINSTOR controller to finish a normal delete; short enough not
to block the calling
+ * thread for too long if the delete is genuinely stuck. Used both from
the management server
+ * (e.g. {@code LinstorPrimaryDataStoreDriverImpl}) and from KVM agent
paths.
+ */
+ public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;
+
+ /**
+ * Returns {@code true} if the named resource definition is no longer
present on the LINSTOR
+ * controller, i.e. the name-filtered {@code resourceDefinitionList} query
returns {@code null} or an empty list.
+ * Used after a {@code resourceDefinitionDelete} to verify the delete
actually completed (LINSTOR can return
+ * success on the API call while the resource lingers in DELETING state
due to peer issues, lost quorum, or
+ * down satellites). Uses the controller-side name filter rather than
scanning every RD on the cluster (cheap even
+ * when polled once per second from {@link #waitForResourceDefinitionDeleted}).
An {@code ApiException} raised by the query is propagated to the caller.
+ */
+ public static boolean isResourceDefinitionGone(DevelopersApi api, String
rscName) throws ApiException {
+ List<ResourceDefinition> matching =
+ api.resourceDefinitionList(Collections.singletonList(rscName),
false, null, null, null);
+ return matching == null || matching.isEmpty();
+ }
+
+ /**
+ * Polls the controller (one poll, then a one-second sleep) until the named
resource definition is gone or the timeout elapses.
+ * Returns {@code true} if the resource was confirmed gone, {@code false}
if it was still
+ * present (or the controller kept erroring) at the deadline, or if the
calling thread was interrupted while sleeping
between polls (the interrupt flag is restored before returning). At least one
poll is always made, and the deadline is
only checked after a poll, so the call can run somewhat longer than
{@code timeoutMillis}. Callers should NOT throw on a
+ * {@code false} return — the upstream API call already reported success
and the operator
+ * may need to investigate manually. Log a WARN with the resource name
instead.
+ */
+ public static boolean waitForResourceDefinitionDeleted(DevelopersApi api,
String rscName, long timeoutMillis) {
+ final long deadline = System.currentTimeMillis() + timeoutMillis;
+ while (true) {
+ try {
+ if (isResourceDefinitionGone(api, rscName)) {
+ return true;
+ }
+ } catch (ApiException e) {
+ LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}",
rscName, e.getMessage());
+ // Keep polling until the deadline — controller may be transiently unavailable.
+ }
+ if (System.currentTimeMillis() >= deadline) {
+ return false;
+ }
+ try {
+ Thread.sleep(1_000L);
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ return false;
+ }
+ }
+ }
+
/**
* Returns a pair list of resource-definitions with ther 1:1 mapped
resource-group objects that start with the
* resource name `startWith`