This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new c7c6bc9d61e HDDS-14725. Print retry messages to stderr when SCMs are
unavailable (#9834)
c7c6bc9d61e is described below
commit c7c6bc9d61eb64936e812d0b98ebce8168698f34
Author: Gargi Jaiswal <[email protected]>
AuthorDate: Tue Mar 3 17:20:14 2026 +0530
HDDS-14725. Print retry messages to stderr when SCMs are unavailable (#9834)
---
.../scm/proxy/SCMFailoverProxyProviderBase.java | 47 +++++++++++++++++++++-
.../hadoop/hdds/scm/TestFailoverWithSCMHA.java | 43 ++++++++++++++++++++
2 files changed, 88 insertions(+), 2 deletions(-)
diff --git
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
index 272db7a04ae..05e06e57e1b 100644
---
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
+++
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java
@@ -39,6 +39,7 @@
import org.apache.hadoop.io.retry.FailoverProxyProvider;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision;
import org.apache.hadoop.ipc_.ProtobufRpcEngine;
import org.apache.hadoop.ipc_.RPC;
import org.apache.hadoop.net.NetUtils;
@@ -333,13 +334,20 @@ public RetryAction shouldRetry(Exception e, int retry,
}
}
+ RetryPolicy.RetryAction retryAction = SCMHAUtils.getRetryAction(
+ failover, retry, e, maxRetryCount, getRetryInterval());
+
+ if (retryAction.action == RetryDecision.RETRY
+ || retryAction.action == RetryDecision.FAILOVER_AND_RETRY) {
+ printRetryMessage(e, failover, retryAction.delayMillis);
+ }
+
if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
setUpdatedLeaderNodeID();
} else {
performFailoverToAssignedLeader(null, e);
}
- return SCMHAUtils.getRetryAction(failover, retry, e, maxRetryCount,
- getRetryInterval());
+ return retryAction;
}
};
}
@@ -347,4 +355,39 @@ public RetryAction shouldRetry(Exception e, int retry,
public synchronized void setUpdatedLeaderNodeID() {
this.updatedLeaderNodeID = getCurrentProxySCMNodeId();
}
+
+  /**
+   * Prints a one-line, user-facing retry notice to stderr.
+   * Shows the failure, the protocol and SCM node involved, and failover
+   * progress. Only called when a retry will actually occur.
+   *
+   * <p>The reported exception type is the simple class name of the
+   * exception's cause when there is one, otherwise of the exception itself.
+   * The reported message is taken from the cause when the cause has one; in
+   * that case only the text before the first colon is kept, provided the
+   * colon is found within the first 100 characters, to keep the line short.
+   * Otherwise the exception's own message is used unmodified. If no message
+   * is available at all, a fixed placeholder is printed instead of the
+   * literal text "null".
+   *
+   * @param exception the exception that triggered the retry
+   * @param failoverCount the number of failover attempts made so far
+   * @param delayMillis the delay before the next retry attempt
+   */
+  private void printRetryMessage(Exception exception, int failoverCount,
+      long delayMillis) {
+    Throwable cause = exception.getCause();
+    Throwable reported = cause != null ? cause : exception;
+
+    // getSimpleName() is empty for anonymous classes; fall back to the
+    // full name so the line never starts with a blank type.
+    String exceptionType = reported.getClass().getSimpleName();
+    if (exceptionType.isEmpty()) {
+      exceptionType = reported.getClass().getName();
+    }
+
+    // Extract concise error message
+    String causeMsg = cause != null ? cause.getMessage() : null;
+    String errorMsg;
+    if (causeMsg != null) {
+      int colonIndex = causeMsg.indexOf(':');
+      errorMsg = colonIndex > 0 && colonIndex < 100
+          ? causeMsg.substring(0, colonIndex) : causeMsg;
+    } else {
+      errorMsg = exception.getMessage();
+    }
+    // Neither the cause nor the exception carried a usable message; avoid
+    // showing "null" (or nothing) to the user.
+    if (errorMsg == null || errorMsg.isEmpty()) {
+      errorMsg = "no details available";
+    }
+
+    System.err.printf("%s: %s, while invoking %s over %s. " +
+        "Retrying in %dms after %d failover attempt(s).%n",
+        exceptionType,
+        errorMsg,
+        protocolClass.getSimpleName(),
+        getCurrentProxySCMNodeId(),
+        delayMillis,
+        failoverCount);
+  }
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestFailoverWithSCMHA.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestFailoverWithSCMHA.java
index a4318a2cc69..ac80c4bd689 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestFailoverWithSCMHA.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestFailoverWithSCMHA.java
@@ -25,6 +25,8 @@
import com.google.protobuf.ByteString;
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
@@ -45,6 +47,7 @@
import org.apache.hadoop.hdds.tracing.TracingUtil;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl;
+import org.apache.hadoop.ozone.admin.OzoneAdmin;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.GenericTestUtils.LogCapturer;
import org.junit.jupiter.api.AfterEach;
@@ -66,6 +69,14 @@ public class TestFailoverWithSCMHA {
private static final long SNAPSHOT_THRESHOLD = 5;
+ /**
+ * Argument lists for the admin commands exercised by
+ * testRetryMessageShownWhenScmUnavailable; each inner array is passed
+ * as-is to a single OzoneAdmin#execute call, in the order listed here.
+ */
+ private static final String[][] OZONE_ADMIN_SCM_COMMANDS = {
+ {"datanode", "list"},
+ {"pipeline", "list"},
+ {"scm", "roles"},
+ {"container", "list"},
+ {"safemode", "status"}
+ };
+
@BeforeEach
public void init() throws Exception {
conf = new OzoneConfiguration();
@@ -214,6 +225,38 @@ public void
testContainerBalancerPersistsConfigurationInAllSCMs()
}
}
+  /**
+   * Verifies that when SCMs are unavailable, the CLI shows retry messages
+   * on stderr before eventually failing for all SCM-querying commands.
+   *
+   * <p>The cluster is shut down first, then every command in
+   * {@code OZONE_ADMIN_SCM_COMMANDS} is run through a fresh
+   * {@code OzoneAdmin} while stderr is captured.
+   */
+  @Test
+  public void testRetryMessageShownWhenScmUnavailable() throws Exception {
+    // Small retry count and interval, presumably so each command gives up
+    // quickly once no SCM is reachable.
+    SCMClientConfig scmClientConfig = conf.getObject(SCMClientConfig.class);
+    scmClientConfig.setRetryCount(2);
+    scmClientConfig.setRetryInterval(50);
+    conf.setFromObject(scmClientConfig);
+
+    // Copy the cluster configuration before shutdown so the CLI can be
+    // pointed at the (now stopped) SCMs.
+    // NOTE(review): assumes cluster.getConf() reflects the retry settings
+    // written to conf above - confirm.
+    Map<String, String> configOverrides = new HashMap<>();
+    cluster.getConf().forEach(entry ->
+        configOverrides.put(entry.getKey(), entry.getValue()));
+
+    cluster.shutdown();
+    cluster = null;
+
+    OzoneAdmin ozoneAdmin = new OzoneAdmin();
+    ozoneAdmin.setConfigurationOverrides(configOverrides);
+
+    for (String[] args : OZONE_ADMIN_SCM_COMMANDS) {
+      try (GenericTestUtils.PrintStreamCapturer err =
+          GenericTestUtils.captureErr()) {
+        ozoneAdmin.execute(args);
+
+        // Retry message format:
+        // "... Retrying in Xms after N failover attempt(s)."
+        // Matched in its exact case; the description names the failing
+        // command because the loop covers several.
+        assertThat(err.get())
+            .as("stderr of 'ozone admin %s'", String.join(" ", args))
+            .contains("Retrying in", "failover attempt(s)");
+      }
+    }
+  }
+
+
static StorageContainerManager getLeader(MiniOzoneHAClusterImpl impl) {
for (StorageContainerManager scm : impl.getStorageContainerManagers()) {
if (scm.checkLeader()) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]