This is an automated email from the ASF dual-hosted git repository.

DaanHoogland pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudstack.git


The following commit(s) were added to refs/heads/main by this push:
     new 850b44317a4 fix(linstor): verify resource deletion completes; warn if stuck in DELETING (#13076)
850b44317a4 is described below

commit 850b44317a4c7e4f57b9d03f6890fd143bbd63a8
Author: James Peru Mmbono <[email protected]>
AuthorDate: Thu May 21 17:57:33 2026 +0300

    fix(linstor): verify resource deletion completes; warn if stuck in DELETING (#13076)
    
    Co-authored-by: jmsperu <[email protected]>
---
 .../kvm/storage/LinstorStorageAdaptor.java         | 11 +++++
 .../driver/LinstorPrimaryDataStoreDriverImpl.java  | 14 ++++++
 .../storage/datastore/util/LinstorUtil.java        | 52 ++++++++++++++++++++++
 3 files changed, 77 insertions(+)

diff --git a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
index 77953a32e63..31a41cd9407 100644
--- a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
+++ b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java
@@ -514,6 +514,17 @@ public class LinstorStorageAdaptor implements StorageAdaptor {
                 ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName());
                 checkLinstorAnswersThrow(answers);
                 deleted = true;
+
+                // LINSTOR can return success here while the resource lingers in DELETING state
+                // on the controller (down peer, lost quorum, etc.). Confirm it's actually gone
+                // — if not, log a WARN so operators can clear it manually. Don't throw: the
+                // CloudStack-side accounting has already moved on.
+                if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(),
+                        LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
+                    logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
+                            "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).",
+                            rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
+                }
             }
         }
         return deleted;
diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
index 83dacf74e8d..d7451fab18f 100644
--- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
+++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java
@@ -232,6 +232,20 @@ public class LinstorPrimaryDataStoreDriverImpl implements PrimaryDataStoreDriver
                 throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName);
             }
             logger.info("Linstor: Deleted resource {}", rscDefName);
+
+            // LINSTOR can return success on the delete API call while the resource lingers in
+            // DELETING state (peer issues, lost quorum, satellite down). Verify the resource is
+            // actually gone — if not, log a WARN so operators see it. We deliberately do NOT
+            // throw here: the volume is already considered gone on the CloudStack side, and
+            // throwing would leave the CS DB and LINSTOR in different states.
+            if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName,
+                    LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
+            {
+                logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
+                        "may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " +
+                        "and clear manually if the resource has no live peers.",
+                        rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
+            }
         } catch (ApiException apiEx)
         {
             logger.error("Linstor: ApiEx - " + apiEx.getMessage());
diff --git a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
index 7c45493dddc..239331077e1 100644
--- a/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
+++ b/plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java
@@ -401,6 +401,58 @@ public class LinstorUtil {
                 .collect(Collectors.toList());
     }
 
+    /**
+     * Default per-call timeout for {@link #waitForResourceDefinitionDeleted}. Long enough for a
+     * healthy LINSTOR controller to finish a normal delete; short enough not to block the calling
+     * thread for too long if the delete is genuinely stuck. Used both from the management server
+     * (e.g. {@code LinstorPrimaryDataStoreDriverImpl}) and from KVM agent paths.
+     */
+    public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;
+
+    /**
+     * Returns {@code true} if the named resource definition is no longer present on the LINSTOR
+     * controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually
+     * completed (LINSTOR can return success on the API call while the resource lingers in
+     * DELETING state due to peer issues, lost quorum, or down satellites). Uses the
+     * controller-side name filter rather than scanning every RD on the cluster (cheap even
+     * when polled once per second from {@link #waitForResourceDefinitionDeleted}).
+     */
+    public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException {
+        List<ResourceDefinition> matching =
+                api.resourceDefinitionList(Collections.singletonList(rscName), false, null, null, null);
+        return matching == null || matching.isEmpty();
+    }
+
+    /**
+     * Polls the controller until the named resource definition is gone or the timeout elapses.
+     * Returns {@code true} if the resource was confirmed gone, {@code false} if it was still
+     * present (or the controller kept erroring) at the deadline. Callers should NOT throw on a
+     * {@code false} return — the upstream API call already reported success and the operator
+     * may need to investigate manually. Log a WARN with the resource name instead.
+     */
+    public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) {
+        final long deadline = System.currentTimeMillis() + timeoutMillis;
+        while (true) {
+            try {
+                if (isResourceDefinitionGone(api, rscName)) {
+                    return true;
+                }
+            } catch (ApiException e) {
+                LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage());
+                // Keep polling — controller may be transiently unavailable.
+            }
+            if (System.currentTimeMillis() >= deadline) {
+                return false;
+            }
+            try {
+                Thread.sleep(1_000L);
+            } catch (InterruptedException ie) {
+                Thread.currentThread().interrupt();
+                return false;
+            }
+        }
+    }
+
     /**
      * Returns a pair list of resource-definitions with ther 1:1 mapped resource-group objects that start with the
      * resource name `startWith`

Reply via email to