This is an automated email from the ASF dual-hosted git repository.
jt2594838 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new ef0d9f8534f Improve IT cluster readiness diagnostics (#17903)
ef0d9f8534f is described below
commit ef0d9f8534fbbb01efb294ce548f53361fac8ccb
Author: Caideyipi <[email protected]>
AuthorDate: Thu Jun 18 16:19:46 2026 +0800
Improve IT cluster readiness diagnostics (#17903)
* Improve IT cluster readiness diagnostics
* Increase pipe IT cluster readiness retries
---
.github/workflows/pipe-it.yml | 11 ++
.../org/apache/iotdb/it/env/MultiEnvFactory.java | 1 +
.../iotdb/it/env/cluster/env/AbstractEnv.java | 112 +++++++++++++++++++--
3 files changed, 118 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/pipe-it.yml b/.github/workflows/pipe-it.yml
index 0a4e5e54eda..2a9fbbd1239 100644
--- a/.github/workflows/pipe-it.yml
+++ b/.github/workflows/pipe-it.yml
@@ -85,6 +85,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-pl integration-test \
@@ -188,6 +189,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -295,6 +297,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -402,6 +405,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -491,6 +495,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-pl integration-test \
@@ -577,6 +582,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-pl integration-test \
@@ -663,6 +669,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-pl integration-test \
@@ -749,6 +756,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
-pl integration-test \
@@ -852,6 +860,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -958,6 +967,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -1047,6 +1057,7 @@ jobs:
mvn clean verify \
-P with-integration-tests \
-DskipUTs \
+ -DintegrationTest.clusterReadyRetryCount=90 \
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }},${{ matrix.cluster3 }} \
-pl integration-test \
diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
index 5832f1c485b..f2e5f9c1f90 100644
--- a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
+++ b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java
@@ -51,6 +51,7 @@ public class MultiEnvFactory {
/** Create several environments according to the specific number. */
public static void createEnv(final int num) {
// Not judge EnvType for individual test convenience
+ envList.clear();
final long startTime = System.currentTimeMillis();
for (int i = 0; i < num; ++i) {
try {
diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
index 77ccc25d936..82513929b69 100644
--- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
+++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java
@@ -96,6 +96,9 @@ import static org.apache.iotdb.jdbc.Config.VERSION;
public abstract class AbstractEnv implements BaseEnv {
private static final Logger logger = IoTDBTestLogger.logger;
+ private static final int DEFAULT_CLUSTER_READY_RETRY_COUNT = 30;
+ private static final String CLUSTER_READY_RETRY_COUNT_PROPERTY =
+ "integrationTest.clusterReadyRetryCount";
private final Random rand = new Random();
protected List<ConfigNodeWrapper> configNodeWrapperList = Collections.emptyList();
@@ -104,7 +107,7 @@ public abstract class AbstractEnv implements BaseEnv {
protected String testMethodName = null;
protected int index = 0;
protected long startTime;
- protected int retryCount = 30;
+ protected int retryCount = getDefaultClusterReadyRetryCount();
private IClientManager<TEndPoint, SyncConfigNodeIServiceClient> clientManager;
private List<String> configNodeKillPoints = new ArrayList<>();
private List<String> dataNodeKillPoints = new ArrayList<>();
@@ -128,6 +131,12 @@ public abstract class AbstractEnv implements BaseEnv {
this.clusterConfig = new MppClusterConfig();
}
+ private static int getDefaultClusterReadyRetryCount() {
+ final int configuredRetryCount =
+ Integer.getInteger(CLUSTER_READY_RETRY_COUNT_PROPERTY, DEFAULT_CLUSTER_READY_RETRY_COUNT);
+ return configuredRetryCount > 0 ? configuredRetryCount : DEFAULT_CLUSTER_READY_RETRY_COUNT;
+ }
+
@Override
public ClusterConfig getConfig() {
return clusterConfig;
@@ -401,12 +410,14 @@ public abstract class AbstractEnv implements BaseEnv {
processStatusMap.clear();
showClusterResp = client.showCluster();
+ showClusterStatus = showClusterResp.getStatus();
+ actualNodeSize = showClusterResp.getNodeStatusSize();
+ lastNodeStatus = showClusterResp.getNodeStatus();
// Check resp status
if (showClusterResp.getStatus().getCode() !=
TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
passed = false;
showClusterPassed = false;
- showClusterStatus = showClusterResp.getStatus();
}
// Check the number of nodes
@@ -416,7 +427,6 @@ public abstract class AbstractEnv implements BaseEnv {
+ extraNodeWrappers.size()) {
passed = false;
nodeSizePassed = false;
- actualNodeSize = showClusterResp.getNodeStatusSize();
}
// Check the status of nodes
@@ -424,7 +434,6 @@ public abstract class AbstractEnv implements BaseEnv {
passed = nodeStatusCheck.test(showClusterResp.getNodeStatus());
if (!passed) {
nodeStatusPassed = false;
- lastNodeStatus = showClusterResp.getNodeStatus();
}
}
@@ -510,8 +519,93 @@ public abstract class AbstractEnv implements BaseEnv {
}
}
+ dumpTestJVMSnapshotQuietly("cluster status check failed");
throw new AssertionError(
- String.format("After %d times retry, the cluster can't work!", retryCount));
+ buildClusterStatusFailureMessage(
+ showClusterPassed,
+ nodeSizePassed,
+ nodeStatusPassed,
+ processStatusPassed,
+ showClusterStatus,
+ actualNodeSize,
+ lastNodeStatus,
+ processStatusMap,
+ lastException));
+ }
+
+ private String buildClusterStatusFailureMessage(
+ final boolean showClusterPassed,
+ final boolean nodeSizePassed,
+ final boolean nodeStatusPassed,
+ final boolean processStatusPassed,
+ final TSStatus showClusterStatus,
+ final int actualNodeSize,
+ final Map<Integer, String> lastNodeStatus,
+ final Map<AbstractNodeWrapper, Integer> processStatusMap,
+ final Exception lastException) {
+ final StringBuilder builder =
+ new StringBuilder(
+ String.format("After %d times retry, the cluster status check failed", retryCount));
+ builder
+ .append(": showClusterPassed=")
+ .append(showClusterPassed)
+ .append(", nodeSizePassed=")
+ .append(nodeSizePassed)
+ .append(", nodeStatusPassed=")
+ .append(nodeStatusPassed)
+ .append(", processStatusPassed=")
+ .append(processStatusPassed)
+ .append(", expectedNodeSize=")
+ .append(
+ configNodeWrapperList.size() + dataNodeWrapperList.size() + aiNodeWrapperList.size())
+ .append(", actualNodeSize=")
+ .append(actualNodeSize);
+ if (showClusterStatus != null) {
+ builder.append(", showClusterStatus=").append(showClusterStatus);
+ }
+ if (lastNodeStatus != null) {
+ builder.append(", lastNodeStatus=").append(lastNodeStatus);
+ }
+ if (!processStatusMap.isEmpty()) {
+ builder.append(", processStatus=").append(formatProcessStatus(processStatusMap));
+ }
+ if (lastException != null) {
+ builder
+ .append(", lastException=")
+ .append(lastException.getClass().getName())
+ .append(": ")
+ .append(lastException.getMessage());
+ }
+ builder.append(", logDirs=").append(getClusterLogDirs());
+ return builder.toString();
+ }
+
+ private Map<String, Integer> formatProcessStatus(
+ final Map<AbstractNodeWrapper, Integer> processStatusMap) {
+ final Map<String, Integer> result = new LinkedHashMap<>();
+ processStatusMap.forEach(
+ (nodeWrapper, statusCode) -> result.put(nodeWrapper.getId(), statusCode));
+ return result;
+ }
+
+ private List<String> getClusterLogDirs() {
+ final List<AbstractNodeWrapper> allNodeWrappers = new ArrayList<>();
+ allNodeWrappers.addAll(configNodeWrapperList);
+ allNodeWrappers.addAll(dataNodeWrapperList);
+ allNodeWrappers.addAll(aiNodeWrapperList);
+ return allNodeWrappers.stream()
+ .map(AbstractNodeWrapper::getLogDirPath)
+ .distinct()
+ .collect(Collectors.toList());
+ }
+
+ private void dumpTestJVMSnapshotQuietly(final String reason) {
+ try {
+ logger.info("Dumping test JVM snapshots because {}.", reason);
+ dumpTestJVMSnapshot();
+ } catch (final Exception e) {
+ logger.warn("Failed to dump test JVM snapshots after {}", reason, e);
+ }
}
private void handleProcessStatus(Map<AbstractNodeWrapper, Integer> processStatusMap) {
@@ -956,6 +1050,7 @@ public abstract class AbstractEnv implements BaseEnv {
.collect(Collectors.toList());
final RequestDelegate<Void> testDelegate =
new ParallelRequestDelegate<>(endpoints, NODE_START_TIMEOUT, this);
+ final Map<String, String> lastConnectionFailures = Collections.synchronizedMap(new HashMap<>());
for (final DataNodeWrapper dataNode : dataNodeWrapperList) {
final String dataNodeEndpoint = dataNode.getIpAndPortString();
testDelegate.addRequest(
@@ -974,6 +1069,8 @@ public abstract class AbstractEnv implements BaseEnv {
return null;
} catch (final Exception e) {
lastException = e;
+ lastConnectionFailures.put(
+ dataNodeEndpoint, e.getClass().getName() + ": " + e.getMessage());
TimeUnit.SECONDS.sleep(1L);
}
}
@@ -987,8 +1084,11 @@ public abstract class AbstractEnv implements BaseEnv {
testDelegate.requestAll();
} catch (final Exception e) {
logger.error("exception in test Cluster with RPC, message: {}", e.getMessage(), e);
+ dumpTestJVMSnapshotQuietly("JDBC connection check failed");
throw new AssertionError(
- String.format("After %d times retry, the cluster can't work!", retryCount));
+ String.format(
+ "After %d times retry, JDBC connections to DataNodes are not ready. endpoints=%s, lastConnectionFailures=%s, logDirs=%s",
+ retryCount, endpoints, lastConnectionFailures, getClusterLogDirs()));
}
}