This is an automated email from the ASF dual-hosted git repository.

deardeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 58a24298999 [fix](cloud) avoid false tablet diagnosis alarms in cloud mode (#60805)
58a24298999 is described below

commit 58a24298999b9f64f3b06cdf72b3f06d44c5c82d
Author: deardeng <[email protected]>
AuthorDate: Thu May 21 14:18:34 2026 +0800

    [fix](cloud) avoid false tablet diagnosis alarms in cloud mode (#60805)
    
    Problem
    
    SHOW TABLET DIAGNOSIS reports local-storage style errors in cloud
    (storage-compute separated) deployments. In cloud mode, replica version
    in FE is not a reliable per-backend data freshness signal, and backend
    disk pressure does not directly represent object-storage write
    availability.
    
    Root cause
    
    Diagnoser applied the same checks for both local and cloud modes:
    
    - ReplicaBackendStatus always evaluated be.diskExceedLimit()
    
    - ReplicaVersionStatus always required replica.version == partition
    visible version (and checked lastFailedVersion)
    
    These checks are valid for local replicas but can produce false
    positives in cloud mode.
    
    Change
    
    - Added cloud-mode guard via Config.isCloudMode() in Diagnoser.
    
    - In cloud mode, skip disk-exceed-limit check when building
    ReplicaBackendStatus.
    
    - In cloud mode, skip replica-version equality and last-failed-version
    checks when building ReplicaVersionStatus.
---
 .../java/org/apache/doris/system/Diagnoser.java    | 22 +++--
 .../doris/clone/TabletReplicaTooSlowTest.java      | 97 ++++++++++++++++++++++
 2 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
index 46c6abbd18a..189080d2f8e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
@@ -26,6 +26,7 @@ import org.apache.doris.catalog.Replica;
 import org.apache.doris.catalog.Tablet;
 import org.apache.doris.catalog.TabletInvertedIndex;
 import org.apache.doris.catalog.TabletMeta;
+import org.apache.doris.common.Config;
 
 import com.google.common.collect.Lists;
 import org.json.simple.JSONObject;
@@ -114,6 +115,7 @@ public class Diagnoser {
         StringBuilder versionErr = new StringBuilder();
         StringBuilder statusErr = new StringBuilder();
         StringBuilder compactionErr = new StringBuilder();
+        boolean isCloudMode = Config.isCloudMode();
         // for local mode, getCachedVisibleVersion return visibleVersion.
         // for cloud mode, the replica version is not updated.
         long visibleVersion = partition.getCachedVisibleVersion();
@@ -143,20 +145,22 @@ public class Diagnoser {
                             + replica.getBackendIdWithoutException() + " is 
not query available. ");
                     break;
                 }
-                if (be.diskExceedLimit()) {
+                if (!isCloudMode && be.diskExceedLimit()) {
                     backendErr.append("Backend " + 
replica.getBackendIdWithoutException() + " has no space left. ");
                     break;
                 }
             } while (false);
             // version
-            if (replica.getVersion() != visibleVersion) {
-                versionErr.append("Replica on backend " + 
replica.getBackendIdWithoutException() + "'s version ("
-                        + replica.getVersion() + ") does not equal"
-                        + " to partition visible version (" + visibleVersion + 
")");
-            } else if (replica.getLastFailedVersion() != -1) {
-                versionErr.append("Replica on backend "
-                        + replica.getBackendIdWithoutException() + "'s last 
failed version is "
-                        + replica.getLastFailedVersion());
+            if (!isCloudMode) {
+                if (replica.getVersion() != visibleVersion) {
+                    versionErr.append("Replica on backend " + 
replica.getBackendIdWithoutException() + "'s version ("
+                            + replica.getVersion() + ") does not equal"
+                            + " to partition visible version (" + 
visibleVersion + ")");
+                } else if (replica.getLastFailedVersion() != -1) {
+                    versionErr.append("Replica on backend "
+                            + replica.getBackendIdWithoutException() + "'s 
last failed version is "
+                            + replica.getLastFailedVersion());
+                }
             }
             // status
             if (!replica.isAlive() || replica.isUserDrop()) {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
index 7375d817354..400f5bc2bb2 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
@@ -17,10 +17,15 @@
 
 package org.apache.doris.clone;
 
+import org.apache.doris.catalog.Database;
 import org.apache.doris.catalog.DiskInfo;
 import org.apache.doris.catalog.Env;
 import org.apache.doris.catalog.LocalTabletInvertedIndex;
+import org.apache.doris.catalog.MaterializedIndex;
+import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Partition;
 import org.apache.doris.catalog.Replica;
+import org.apache.doris.catalog.Tablet;
 import org.apache.doris.catalog.TabletInvertedIndex;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.ExceptionChecker;
@@ -163,6 +168,98 @@ public class TabletReplicaTooSlowTest {
         Assert.assertTrue(result.get(11).get(1).contains("version count is too 
high"));
     }
 
+    private static String getDiagnosisInfo(List<List<String>> rows, String 
item) {
+        for (List<String> row : rows) {
+            if (item.equals(row.get(0))) {
+                return row.get(1);
+            }
+        }
+        return "";
+    }
+
+    private static Map<String, TDisk> copyBackendDisks(Backend backend) {
+        Map<String, TDisk> disks = Maps.newHashMap();
+        for (DiskInfo diskInfo : backend.getDisks().values()) {
+            TDisk tDisk = new TDisk();
+            tDisk.setRootPath(diskInfo.getRootPath());
+            tDisk.setDiskTotalCapacity(diskInfo.getTotalCapacityB());
+            tDisk.setDataUsedCapacity(diskInfo.getDataUsedCapacityB());
+            tDisk.setTrashUsedCapacity(diskInfo.getTrashUsedCapacityB());
+            tDisk.setDiskAvailableCapacity(diskInfo.getAvailableCapacityB());
+            tDisk.setUsed(diskInfo.getState() == DiskInfo.DiskState.ONLINE);
+            tDisk.setPathHash(diskInfo.getPathHash());
+            tDisk.setStorageMedium(diskInfo.getStorageMedium());
+            disks.put(tDisk.getRootPath(), tDisk);
+        }
+        return disks;
+    }
+
+    private static Map<String, TDisk> buildExceedLimitDisks(Backend backend) {
+        Map<String, TDisk> disks = Maps.newHashMap();
+        for (DiskInfo diskInfo : backend.getDisks().values()) {
+            TDisk tDisk = new TDisk();
+            tDisk.setRootPath(diskInfo.getRootPath());
+            tDisk.setDiskTotalCapacity(1L);
+            tDisk.setDataUsedCapacity(1L);
+            tDisk.setTrashUsedCapacity(0L);
+            tDisk.setDiskAvailableCapacity(0L);
+            tDisk.setUsed(true);
+            tDisk.setPathHash(diskInfo.getPathHash());
+            tDisk.setStorageMedium(diskInfo.getStorageMedium());
+            disks.put(tDisk.getRootPath(), tDisk);
+        }
+        return disks;
+    }
+
+    @Test
+    public void testDiagnoseTabletCloudModeSkipDiskAndVersionCheck() throws 
Exception {
+        String tableName = "tbl_diag_cloud_" + Math.abs(random.nextInt());
+        String createStr = "create table test." + tableName + "\n"
+                + "(k1 date, k2 int)\n"
+                + "distributed by hash(k2) buckets 1\n"
+                + "properties\n"
+                + "(\n"
+                + "    \"replication_num\" = \"3\"\n"
+                + ")";
+        ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));
+
+        Database db = Env.getCurrentInternalCatalog().getDbNullable("test");
+        Assert.assertNotNull(db);
+        OlapTable table = (OlapTable) db.getTableNullable(tableName);
+        Assert.assertNotNull(table);
+        Partition partition = table.getAllPartitions().iterator().next();
+        MaterializedIndex index = partition.getBaseIndex();
+        Tablet tablet = index.getTablets().get(0);
+        Replica replica = tablet.getReplicas().get(0);
+        long tabletId = tablet.getId();
+        long visibleVersion = partition.getCachedVisibleVersion();
+        Backend backend = 
Env.getCurrentSystemInfo().getBackend(replica.getBackendIdWithoutException());
+        Assert.assertNotNull(backend);
+
+        Map<String, TDisk> originalDisks = copyBackendDisks(backend);
+        String originCloudUniqueId = Config.cloud_unique_id;
+        long originalVersion = replica.getVersion();
+
+        try {
+            backend.updateDisks(buildExceedLimitDisks(backend));
+            long mismatchVersion = visibleVersion == Long.MAX_VALUE ? 
visibleVersion - 1 : visibleVersion + 1;
+            replica.adminUpdateVersionInfo(mismatchVersion, null, null, 
System.currentTimeMillis());
+
+            List<List<String>> localResult = 
Diagnoser.diagnoseTablet(tabletId);
+            Assert.assertTrue(getDiagnosisInfo(localResult, 
"ReplicaBackendStatus").contains("has no space left"));
+            Assert.assertTrue(getDiagnosisInfo(localResult, 
"ReplicaVersionStatus").contains("does not equal"));
+
+            Config.cloud_unique_id = "diagnose-tablet-cloud-mode-ut";
+            List<List<String>> cloudResult = 
Diagnoser.diagnoseTablet(tabletId);
+            Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, 
"ReplicaBackendStatus"));
+            Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, 
"ReplicaVersionStatus"));
+        } finally {
+            Config.cloud_unique_id = originCloudUniqueId;
+            backend.updateDisks(originalDisks);
+            replica.adminUpdateVersionInfo(originalVersion, null, null, 
System.currentTimeMillis());
+        }
+    }
+
     @Test
     public void test() throws Exception {
         // test colocate tablet repair


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to