YARN-4243. Add retry on establishing Zookeeper connection in
EmbeddedElectorService#serviceInit. Contributed by Xuan Gong.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/0fce5f9a
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/0fce5f9a
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/0fce5f9a

Branch: refs/heads/HDFS-8966
Commit: 0fce5f9a496925f0d53ea6c14318c9b513de9882
Parents: 960201b
Author: Junping Du <junping...@apache.org>
Authored: Thu Oct 22 13:41:09 2015 -0700
Committer: Junping Du <junping...@apache.org>
Committed: Thu Oct 22 13:41:09 2015 -0700

----------------------------------------------------------------------
 .../apache/hadoop/ha/ActiveStandbyElector.java  | 53 ++++++++++++++++++--
 hadoop-yarn-project/CHANGES.txt                 |  3 ++
 .../hadoop/yarn/conf/YarnConfiguration.java     |  4 ++
 .../src/main/resources/yarn-default.xml         |  7 +++
 .../resourcemanager/EmbeddedElectorService.java |  9 ++--
 5 files changed, 68 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
index fcbcfdf..cb2e081 100644
--- 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
+++ 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@@ -208,8 +208,49 @@ public class ActiveStandbyElector implements StatCallback, 
StringCallback {
    */
   public ActiveStandbyElector(String zookeeperHostPorts,
       int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
-      List<ZKAuthInfo> authInfo,
-      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
+      List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
+      int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
+      KeeperException {
+    this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
+      authInfo, app, maxRetryNum, true);
+  }
+
+  /**
+   * Create a new ActiveStandbyElector object <br/>
+   * The elector is created by providing to it the Zookeeper configuration, the
+   * parent znode under which to create the znode and a reference to the
+   * callback interface. <br/>
+   * The parent znode name must be the same for all service instances and
+   * different across services. <br/>
+   * After the leader has been lost, a new leader will be elected after the
+   * session timeout expires. Hence, the app must set this parameter based on
+   * its needs for failure response time. The session timeout must be greater
+   * than the Zookeeper disconnect timeout and is recommended to be 3X that
+   * value to enable Zookeeper to retry transient disconnections. Setting a very
+   * short session timeout may result in frequent transitions between active and
+   * standby states during issues like network outages/GC pauses.
+   * 
+   * @param zookeeperHostPorts
+   *          ZooKeeper hostPort for all ZooKeeper servers
+   * @param zookeeperSessionTimeout
+   *          ZooKeeper session timeout
+   * @param parentZnodeName
+   *          znode under which to create the lock
+   * @param acl
+   *          ZooKeeper ACL's
+   * @param authInfo a list of authentication credentials to add to the
+   *                 ZK connection
+   * @param app
+   *          reference to callback interface object
+   * @param failFast
+   *          if true, do not retry when establishing the ZK connection.
+   * @throws IOException
+   * @throws HadoopIllegalArgumentException
+   */
+  public ActiveStandbyElector(String zookeeperHostPorts,
+      int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
+      List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
+      int maxRetryNum, boolean failFast) throws IOException,
       HadoopIllegalArgumentException, KeeperException {
     if (app == null || acl == null || parentZnodeName == null
         || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
@@ -225,8 +266,12 @@ public class ActiveStandbyElector implements StatCallback, 
StringCallback {
     zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
     this.maxRetryNum = maxRetryNum;
 
-    // createConnection for future API calls
-    createConnection();
+    // establish the ZK Connection for future API calls
+    if (failFast) {
+      createConnection();
+    } else {
+      reEstablishSession();
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 024255c..9f35307 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED
     YARN-3985. Make ReservationSystem persist state using RMStateStore
     reservation APIs. (adhoot via asuresh)
 
+    YARN-4243. Add retry on establishing Zookeeper connection in
+    EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)
+
   OPTIMIZATIONS
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 3e89259..913b5df 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -531,6 +531,10 @@ public class YarnConfiguration extends Configuration {
   public static final int
       DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
 
+  /** number of zookeeper operation retry times in ActiveStandbyElector */
+  public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
+      + "failover-controller.active-standby-elector.zk.retries";
+
   ////////////////////////////////
   // RM state store configs
   ////////////////////////////////

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 5dc4590..c6ffe18 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -389,6 +389,13 @@
   </property>
 
   <property>
+    <description>When automatic failover is enabled, the number of ZooKeeper
+      operation retries in ActiveStandbyElector.</description>
+    <name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
+    <!--<value>3</value>-->
+  </property>
+
+  <property>
     <description>The maximum number of completed applications RM state
     store keeps, less than or equals to 
${yarn.resourcemanager.max-completed-applications}.
     By default, it equals to 
${yarn.resourcemanager.max-completed-applications}.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
index 73bdca0..72327e8 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
@@ -86,11 +86,12 @@ public class EmbeddedElectorService extends AbstractService
     List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
     List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
 
-    int maxRetryNum = conf.getInt(
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
+    int maxRetryNum =
+        conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
+          .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+            CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
     elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
-        electionZNode, zkAcls, zkAuths, this, maxRetryNum);
+        electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);
 
     elector.ensureParentZNode();
     if (!isParentZnodeSafe(clusterId)) {

Reply via email to