This is an automated email from the ASF dual-hosted git repository. sammichen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/master by this push: new 642d660 HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373) 642d660 is described below commit 642d6602c417406485be6a09e29c099de88c854b Author: GlenGeng <gleng...@tencent.com> AuthorDate: Fri Sep 4 15:11:41 2020 +0800 HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373) --- .../main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java | 5 +++++ hadoop-hdds/common/src/main/resources/ozone-default.xml | 10 ++++++++++ .../container/common/statemachine/SCMConnectionManager.java | 9 ++++++--- .../java/org/apache/hadoop/hdds/utils/HddsServerUtil.java | 13 +++++++++++++ hadoop-ozone/dist/src/main/compose/testlib.sh | 8 +++++--- 5 files changed, 39 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java index 4e624c6..672b440 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java @@ -241,6 +241,11 @@ public final class ScmConfigKeys { public static final String OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT = "1s"; + public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT = + "ozone.scm.heartbeat.rpc-retry-count"; + public static final int OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT = + 15; + /** * Defines how frequently we will log the missing of heartbeat to a specific * SCM. In the default case we will write a warning message for each 10 diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index 5770448..f16ff3f 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -978,6 +978,16 @@ </description> </property> <property> + <name>ozone.scm.heartbeat.rpc-retry-count</name> + <value>15</value> + <tag>OZONE, MANAGEMENT</tag> + <description> + Retry count for the RPC from Datanode to SCM. The rpc-retry-interval + is 1s. Make sure rpc-retry-count * (rpc-timeout + rpc-retry-interval) + is less than hdds.heartbeat.interval. + </description> + </property> + <property> <name>ozone.scm.heartbeat.thread.interval</name> <value>3s</value> <tag>OZONE, MANAGEMENT</tag> diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java index ebc53c9..c7dd9c6 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java @@ -46,6 +46,7 @@ import org.apache.hadoop.security.UserGroupInformation; import static java.util.Collections.unmodifiableList; import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcTimeOutInMilliseconds; +import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryCount; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -149,7 +150,8 @@ public class SCMConnectionManager RPC.getProtocolVersion(StorageContainerDatanodeProtocolPB.class); RetryPolicy retryPolicy = - RetryPolicies.retryForeverWithFixedSleep( + RetryPolicies.retryUpToMaximumCountWithFixedSleep( + getScmRpcRetryCount(conf), 1000, TimeUnit.MILLISECONDS); StorageContainerDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy( @@ -193,8 +195,9 @@ public class SCMConnectionManager RPC.getProtocolVersion(ReconDatanodeProtocolPB.class); RetryPolicy retryPolicy = - RetryPolicies.retryUpToMaximumCountWithFixedSleep(10, - 60000, TimeUnit.MILLISECONDS); + RetryPolicies.retryUpToMaximumCountWithFixedSleep( + getScmRpcRetryCount(conf), + 1000, TimeUnit.MILLISECONDS); ReconDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy( ReconDatanodeProtocolPB.class, version, address, UserGroupInformation.getCurrentUser(), hadoopConfig, diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java index 8e7f326..13e08a1 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java @@ -65,6 +65,8 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_LOG_W import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs; @@ -325,6 +327,17 @@ public final class HddsServerUtil { } /** + * Max retry count of rpcProxy for EndpointStateMachine of SCM. + * + * @param conf - Ozone Config + * @return - Max retry count. + */ + public static int getScmRpcRetryCount(ConfigurationSource conf) { + return conf.getInt(OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT, + OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT); + } + + /** * Log Warn interval. * * @param conf - Ozone Config diff --git a/hadoop-ozone/dist/src/main/compose/testlib.sh b/hadoop-ozone/dist/src/main/compose/testlib.sh index db449b9..b122479 100755 --- a/hadoop-ozone/dist/src/main/compose/testlib.sh +++ b/hadoop-ozone/dist/src/main/compose/testlib.sh @@ -60,7 +60,7 @@ find_tests(){ echo $tests } -## @description wait until safemode exit (or 180 seconds) +## @description wait until safemode exit (or 240 seconds) wait_for_safemode_exit(){ # version-dependent : ${OZONE_SAFEMODE_STATUS_COMMAND:=ozone admin safemode status --verbose} @@ -68,8 +68,8 @@ wait_for_safemode_exit(){ #Reset the timer SECONDS=0 - #Don't give it up until 180 seconds - while [[ $SECONDS -lt 180 ]]; do + #Don't give it up until 240 seconds + while [[ $SECONDS -lt 240 ]]; do #This line checks the safemode status in scm local command="${OZONE_SAFEMODE_STATUS_COMMAND}" @@ -79,6 +79,8 @@ wait_for_safemode_exit(){ status=$(docker-compose exec -T scm bash -c "$command") fi + echo "SECONDS: $SECONDS" + echo $status if [[ "$status" ]]; then if [[ ${status} == "SCM is out of safe mode." ]]; then --------------------------------------------------------------------- To unsubscribe, e-mail: ozone-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: ozone-commits-h...@hadoop.apache.org