This is an automated email from the ASF dual-hosted git repository.

jt2594838 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/master by this push:
     new ef0d9f8534f Improve IT cluster readiness diagnostics (#17903)
ef0d9f8534f is described below

commit ef0d9f8534fbbb01efb294ce548f53361fac8ccb
Author: Caideyipi <[email protected]>
AuthorDate: Thu Jun 18 16:19:46 2026 +0800

    Improve IT cluster readiness diagnostics (#17903)
    
    * Improve IT cluster readiness diagnostics
    
    * Increase pipe IT cluster readiness retries
---
 .github/workflows/pipe-it.yml                      |  11 ++
 .../org/apache/iotdb/it/env/MultiEnvFactory.java   |   1 +
 .../iotdb/it/env/cluster/env/AbstractEnv.java      | 112 +++++++++++++++++++--
 3 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/pipe-it.yml b/.github/workflows/pipe-it.yml
index 0a4e5e54eda..2a9fbbd1239 100644
--- a/.github/workflows/pipe-it.yml
+++ b/.github/workflows/pipe-it.yml
@@ -85,6 +85,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -pl integration-test \
@@ -188,6 +189,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
               -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -295,6 +297,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -402,6 +405,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -491,6 +495,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -pl integration-test \
@@ -577,6 +582,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -pl integration-test \
@@ -663,6 +669,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -pl integration-test \
@@ -749,6 +756,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
               -pl integration-test \
@@ -852,6 +860,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
               -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -958,6 +967,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
               -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -1047,6 +1057,7 @@ jobs:
               mvn clean verify \
               -P with-integration-tests \
               -DskipUTs \
+              -DintegrationTest.clusterReadyRetryCount=90 \
               -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
               -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }},${{ matrix.cluster3 }} \
               -pl integration-test \
diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
index 5832f1c485b..f2e5f9c1f90 100644
--- a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
+++ b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
@@ -51,6 +51,7 @@ public class MultiEnvFactory {
   /** Create several environments according to the specific number. */
   public static void createEnv(final int num) {
     // Not judge EnvType for individual test convenience
+    envList.clear();
     final long startTime = System.currentTimeMillis();
     for (int i = 0; i < num; ++i) {
       try {
diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
index 77ccc25d936..82513929b69 100644
--- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
+++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
@@ -96,6 +96,9 @@ import static org.apache.iotdb.jdbc.Config.VERSION;
 
 public abstract class AbstractEnv implements BaseEnv {
   private static final Logger logger = IoTDBTestLogger.logger;
+  private static final int DEFAULT_CLUSTER_READY_RETRY_COUNT = 30;
+  private static final String CLUSTER_READY_RETRY_COUNT_PROPERTY =
+      "integrationTest.clusterReadyRetryCount";
 
   private final Random rand = new Random();
   protected List<ConfigNodeWrapper> configNodeWrapperList = Collections.emptyList();
@@ -104,7 +107,7 @@ public abstract class AbstractEnv implements BaseEnv {
   protected String testMethodName = null;
   protected int index = 0;
   protected long startTime;
-  protected int retryCount = 30;
+  protected int retryCount = getDefaultClusterReadyRetryCount();
   private IClientManager<TEndPoint, SyncConfigNodeIServiceClient> clientManager;
   private List<String> configNodeKillPoints = new ArrayList<>();
   private List<String> dataNodeKillPoints = new ArrayList<>();
@@ -128,6 +131,12 @@ public abstract class AbstractEnv implements BaseEnv {
     this.clusterConfig = new MppClusterConfig();
   }
 
+  private static int getDefaultClusterReadyRetryCount() {
+    final int configuredRetryCount =
+        Integer.getInteger(CLUSTER_READY_RETRY_COUNT_PROPERTY, DEFAULT_CLUSTER_READY_RETRY_COUNT);
+    return configuredRetryCount > 0 ? configuredRetryCount : DEFAULT_CLUSTER_READY_RETRY_COUNT;
+  }
+
   @Override
   public ClusterConfig getConfig() {
     return clusterConfig;
@@ -401,12 +410,14 @@ public abstract class AbstractEnv implements BaseEnv {
         processStatusMap.clear();
 
         showClusterResp = client.showCluster();
+        showClusterStatus = showClusterResp.getStatus();
+        actualNodeSize = showClusterResp.getNodeStatusSize();
+        lastNodeStatus = showClusterResp.getNodeStatus();
 
         // Check resp status
         if (showClusterResp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
           passed = false;
           showClusterPassed = false;
-          showClusterStatus = showClusterResp.getStatus();
         }
 
         // Check the number of nodes
@@ -416,7 +427,6 @@ public abstract class AbstractEnv implements BaseEnv {
                 + extraNodeWrappers.size()) {
           passed = false;
           nodeSizePassed = false;
-          actualNodeSize = showClusterResp.getNodeStatusSize();
         }
 
         // Check the status of nodes
@@ -424,7 +434,6 @@ public abstract class AbstractEnv implements BaseEnv {
           passed = nodeStatusCheck.test(showClusterResp.getNodeStatus());
           if (!passed) {
             nodeStatusPassed = false;
-            lastNodeStatus = showClusterResp.getNodeStatus();
           }
         }
 
@@ -510,8 +519,93 @@ public abstract class AbstractEnv implements BaseEnv {
       }
     }
 
+    dumpTestJVMSnapshotQuietly("cluster status check failed");
     throw new AssertionError(
-        String.format("After %d times retry, the cluster can't work!", retryCount));
+        buildClusterStatusFailureMessage(
+            showClusterPassed,
+            nodeSizePassed,
+            nodeStatusPassed,
+            processStatusPassed,
+            showClusterStatus,
+            actualNodeSize,
+            lastNodeStatus,
+            processStatusMap,
+            lastException));
+  }
+
+  private String buildClusterStatusFailureMessage(
+      final boolean showClusterPassed,
+      final boolean nodeSizePassed,
+      final boolean nodeStatusPassed,
+      final boolean processStatusPassed,
+      final TSStatus showClusterStatus,
+      final int actualNodeSize,
+      final Map<Integer, String> lastNodeStatus,
+      final Map<AbstractNodeWrapper, Integer> processStatusMap,
+      final Exception lastException) {
+    final StringBuilder builder =
+        new StringBuilder(
+            String.format("After %d times retry, the cluster status check failed", retryCount));
+    builder
+        .append(": showClusterPassed=")
+        .append(showClusterPassed)
+        .append(", nodeSizePassed=")
+        .append(nodeSizePassed)
+        .append(", nodeStatusPassed=")
+        .append(nodeStatusPassed)
+        .append(", processStatusPassed=")
+        .append(processStatusPassed)
+        .append(", expectedNodeSize=")
+        .append(
+            configNodeWrapperList.size() + dataNodeWrapperList.size() + aiNodeWrapperList.size())
+        .append(", actualNodeSize=")
+        .append(actualNodeSize);
+    if (showClusterStatus != null) {
+      builder.append(", showClusterStatus=").append(showClusterStatus);
+    }
+    if (lastNodeStatus != null) {
+      builder.append(", lastNodeStatus=").append(lastNodeStatus);
+    }
+    if (!processStatusMap.isEmpty()) {
+      builder.append(", processStatus=").append(formatProcessStatus(processStatusMap));
+    }
+    if (lastException != null) {
+      builder
+          .append(", lastException=")
+          .append(lastException.getClass().getName())
+          .append(": ")
+          .append(lastException.getMessage());
+    }
+    builder.append(", logDirs=").append(getClusterLogDirs());
+    return builder.toString();
+  }
+
+  private Map<String, Integer> formatProcessStatus(
+      final Map<AbstractNodeWrapper, Integer> processStatusMap) {
+    final Map<String, Integer> result = new LinkedHashMap<>();
+    processStatusMap.forEach(
+        (nodeWrapper, statusCode) -> result.put(nodeWrapper.getId(), statusCode));
+    return result;
+  }
+
+  private List<String> getClusterLogDirs() {
+    final List<AbstractNodeWrapper> allNodeWrappers = new ArrayList<>();
+    allNodeWrappers.addAll(configNodeWrapperList);
+    allNodeWrappers.addAll(dataNodeWrapperList);
+    allNodeWrappers.addAll(aiNodeWrapperList);
+    return allNodeWrappers.stream()
+        .map(AbstractNodeWrapper::getLogDirPath)
+        .distinct()
+        .collect(Collectors.toList());
+  }
+
+  private void dumpTestJVMSnapshotQuietly(final String reason) {
+    try {
+      logger.info("Dumping test JVM snapshots because {}.", reason);
+      dumpTestJVMSnapshot();
+    } catch (final Exception e) {
+      logger.warn("Failed to dump test JVM snapshots after {}", reason, e);
+    }
   }
 
   private void handleProcessStatus(Map<AbstractNodeWrapper, Integer> processStatusMap) {
@@ -956,6 +1050,7 @@ public abstract class AbstractEnv implements BaseEnv {
             .collect(Collectors.toList());
     final RequestDelegate<Void> testDelegate =
         new ParallelRequestDelegate<>(endpoints, NODE_START_TIMEOUT, this);
+    final Map<String, String> lastConnectionFailures = Collections.synchronizedMap(new HashMap<>());
     for (final DataNodeWrapper dataNode : dataNodeWrapperList) {
       final String dataNodeEndpoint = dataNode.getIpAndPortString();
       testDelegate.addRequest(
@@ -974,6 +1069,8 @@ public abstract class AbstractEnv implements BaseEnv {
                 return null;
               } catch (final Exception e) {
                 lastException = e;
+                lastConnectionFailures.put(
+                    dataNodeEndpoint, e.getClass().getName() + ": " + e.getMessage());
                 TimeUnit.SECONDS.sleep(1L);
               }
             }
@@ -987,8 +1084,11 @@ public abstract class AbstractEnv implements BaseEnv {
       testDelegate.requestAll();
     } catch (final Exception e) {
       logger.error("exception in test Cluster with RPC, message: {}", e.getMessage(), e);
+      dumpTestJVMSnapshotQuietly("JDBC connection check failed");
       throw new AssertionError(
-          String.format("After %d times retry, the cluster can't work!", retryCount));
+          String.format(
+              "After %d times retry, JDBC connections to DataNodes are not ready. endpoints=%s, lastConnectionFailures=%s, logDirs=%s",
+              retryCount, endpoints, lastConnectionFailures, getClusterLogDirs()));
     }
   }
 

Reply via email to