This is an automated email from the ASF dual-hosted git repository.
deardeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 58a24298999 [fix](cloud) avoid false tablet diagnosis alarms in cloud
mode (#60805)
58a24298999 is described below
commit 58a24298999b9f64f3b06cdf72b3f06d44c5c82d
Author: deardeng <[email protected]>
AuthorDate: Thu May 21 14:18:34 2026 +0800
[fix](cloud) avoid false tablet diagnosis alarms in cloud mode (#60805)
Problem
SHOW TABLET DIAGNOSIS reports local-storage-style errors in cloud
(storage-compute separated) deployments. In cloud mode, the replica version
kept in FE is not a reliable per-backend data-freshness signal, and backend
disk pressure does not directly reflect object-storage write
availability.
Root cause
Diagnoser applied the same checks for both local and cloud modes:
- ReplicaBackendStatus always evaluated be.diskExceedLimit()
- ReplicaVersionStatus always required replica.version == partition
visible version (and, when they matched, checked lastFailedVersion != -1)
These checks are valid for local replicas but can produce false
positives in cloud mode.
Change
- Added cloud-mode guard via Config.isCloudMode() in Diagnoser.
- In cloud mode, skip disk-exceed-limit check when building
ReplicaBackendStatus.
- In cloud mode, skip replica-version equality and last-failed-version
checks when building ReplicaVersionStatus.
---
.../java/org/apache/doris/system/Diagnoser.java | 22 +++--
.../doris/clone/TabletReplicaTooSlowTest.java | 97 ++++++++++++++++++++++
2 files changed, 110 insertions(+), 9 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
index 46c6abbd18a..189080d2f8e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
@@ -26,6 +26,7 @@ import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.Tablet;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
+import org.apache.doris.common.Config;
import com.google.common.collect.Lists;
import org.json.simple.JSONObject;
@@ -114,6 +115,7 @@ public class Diagnoser {
StringBuilder versionErr = new StringBuilder();
StringBuilder statusErr = new StringBuilder();
StringBuilder compactionErr = new StringBuilder();
+ boolean isCloudMode = Config.isCloudMode();
// for local mode, getCachedVisibleVersion return visibleVersion.
// for cloud mode, the replica version is not updated.
long visibleVersion = partition.getCachedVisibleVersion();
@@ -143,20 +145,22 @@ public class Diagnoser {
+ replica.getBackendIdWithoutException() + " is
not query available. ");
break;
}
- if (be.diskExceedLimit()) {
+ if (!isCloudMode && be.diskExceedLimit()) {
backendErr.append("Backend " +
replica.getBackendIdWithoutException() + " has no space left. ");
break;
}
} while (false);
// version
- if (replica.getVersion() != visibleVersion) {
- versionErr.append("Replica on backend " +
replica.getBackendIdWithoutException() + "'s version ("
- + replica.getVersion() + ") does not equal"
- + " to partition visible version (" + visibleVersion +
")");
- } else if (replica.getLastFailedVersion() != -1) {
- versionErr.append("Replica on backend "
- + replica.getBackendIdWithoutException() + "'s last
failed version is "
- + replica.getLastFailedVersion());
+ if (!isCloudMode) {
+ if (replica.getVersion() != visibleVersion) {
+ versionErr.append("Replica on backend " +
replica.getBackendIdWithoutException() + "'s version ("
+ + replica.getVersion() + ") does not equal"
+ + " to partition visible version (" +
visibleVersion + ")");
+ } else if (replica.getLastFailedVersion() != -1) {
+ versionErr.append("Replica on backend "
+ + replica.getBackendIdWithoutException() + "'s
last failed version is "
+ + replica.getLastFailedVersion());
+ }
}
// status
if (!replica.isAlive() || replica.isUserDrop()) {
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
index 7375d817354..400f5bc2bb2 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
@@ -17,10 +17,15 @@
package org.apache.doris.clone;
+import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.LocalTabletInvertedIndex;
+import org.apache.doris.catalog.MaterializedIndex;
+import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Partition;
import org.apache.doris.catalog.Replica;
+import org.apache.doris.catalog.Tablet;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.common.Config;
import org.apache.doris.common.ExceptionChecker;
@@ -163,6 +168,98 @@ public class TabletReplicaTooSlowTest {
Assert.assertTrue(result.get(11).get(1).contains("version count is too
high"));
}
+ private static String getDiagnosisInfo(List<List<String>> rows, String
item) {
+ for (List<String> row : rows) {
+ if (item.equals(row.get(0))) {
+ return row.get(1);
+ }
+ }
+ return "";
+ }
+
+ private static Map<String, TDisk> copyBackendDisks(Backend backend) {
+ Map<String, TDisk> disks = Maps.newHashMap();
+ for (DiskInfo diskInfo : backend.getDisks().values()) {
+ TDisk tDisk = new TDisk();
+ tDisk.setRootPath(diskInfo.getRootPath());
+ tDisk.setDiskTotalCapacity(diskInfo.getTotalCapacityB());
+ tDisk.setDataUsedCapacity(diskInfo.getDataUsedCapacityB());
+ tDisk.setTrashUsedCapacity(diskInfo.getTrashUsedCapacityB());
+ tDisk.setDiskAvailableCapacity(diskInfo.getAvailableCapacityB());
+ tDisk.setUsed(diskInfo.getState() == DiskInfo.DiskState.ONLINE);
+ tDisk.setPathHash(diskInfo.getPathHash());
+ tDisk.setStorageMedium(diskInfo.getStorageMedium());
+ disks.put(tDisk.getRootPath(), tDisk);
+ }
+ return disks;
+ }
+
+ private static Map<String, TDisk> buildExceedLimitDisks(Backend backend) {
+ Map<String, TDisk> disks = Maps.newHashMap();
+ for (DiskInfo diskInfo : backend.getDisks().values()) {
+ TDisk tDisk = new TDisk();
+ tDisk.setRootPath(diskInfo.getRootPath());
+ tDisk.setDiskTotalCapacity(1L);
+ tDisk.setDataUsedCapacity(1L);
+ tDisk.setTrashUsedCapacity(0L);
+ tDisk.setDiskAvailableCapacity(0L);
+ tDisk.setUsed(true);
+ tDisk.setPathHash(diskInfo.getPathHash());
+ tDisk.setStorageMedium(diskInfo.getStorageMedium());
+ disks.put(tDisk.getRootPath(), tDisk);
+ }
+ return disks;
+ }
+
+ @Test
+ public void testDiagnoseTabletCloudModeSkipDiskAndVersionCheck() throws
Exception {
+ String tableName = "tbl_diag_cloud_" + Math.abs(random.nextInt());
+ String createStr = "create table test." + tableName + "\n"
+ + "(k1 date, k2 int)\n"
+ + "distributed by hash(k2) buckets 1\n"
+ + "properties\n"
+ + "(\n"
+ + " \"replication_num\" = \"3\"\n"
+ + ")";
+ ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));
+
+ Database db = Env.getCurrentInternalCatalog().getDbNullable("test");
+ Assert.assertNotNull(db);
+ OlapTable table = (OlapTable) db.getTableNullable(tableName);
+ Assert.assertNotNull(table);
+ Partition partition = table.getAllPartitions().iterator().next();
+ MaterializedIndex index = partition.getBaseIndex();
+ Tablet tablet = index.getTablets().get(0);
+ Replica replica = tablet.getReplicas().get(0);
+ long tabletId = tablet.getId();
+ long visibleVersion = partition.getCachedVisibleVersion();
+ Backend backend =
Env.getCurrentSystemInfo().getBackend(replica.getBackendIdWithoutException());
+ Assert.assertNotNull(backend);
+
+ Map<String, TDisk> originalDisks = copyBackendDisks(backend);
+ String originCloudUniqueId = Config.cloud_unique_id;
+ long originalVersion = replica.getVersion();
+
+ try {
+ backend.updateDisks(buildExceedLimitDisks(backend));
+ long mismatchVersion = visibleVersion == Long.MAX_VALUE ?
visibleVersion - 1 : visibleVersion + 1;
+ replica.adminUpdateVersionInfo(mismatchVersion, null, null,
System.currentTimeMillis());
+
+ List<List<String>> localResult =
Diagnoser.diagnoseTablet(tabletId);
+ Assert.assertTrue(getDiagnosisInfo(localResult,
"ReplicaBackendStatus").contains("has no space left"));
+ Assert.assertTrue(getDiagnosisInfo(localResult,
"ReplicaVersionStatus").contains("does not equal"));
+
+ Config.cloud_unique_id = "diagnose-tablet-cloud-mode-ut";
+ List<List<String>> cloudResult =
Diagnoser.diagnoseTablet(tabletId);
+ Assert.assertEquals("OK", getDiagnosisInfo(cloudResult,
"ReplicaBackendStatus"));
+ Assert.assertEquals("OK", getDiagnosisInfo(cloudResult,
"ReplicaVersionStatus"));
+ } finally {
+ Config.cloud_unique_id = originCloudUniqueId;
+ backend.updateDisks(originalDisks);
+ replica.adminUpdateVersionInfo(originalVersion, null, null,
System.currentTimeMillis());
+ }
+ }
+
@Test
public void test() throws Exception {
// test colocate tablet repair
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]