Repository: hbase
Updated Branches:
  refs/heads/branch-2.1 0d008b479 -> fea75742b


HBASE-21164 reportForDuty should do backoff rather than retry

Remove unused methods from Sleeper (its ok, its @Private).
Remove notion of startTime from Sleeper handling (it is is unused).
Allow passing in how long to sleep so can maintain externally.
In HRS, use a RetryCounter to calculate backoff sleep time for when
reportForDuty is failing against a struggling Master.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/fea75742
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/fea75742
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/fea75742

Branch: refs/heads/branch-2.1
Commit: fea75742b4b9dc476bb2b83efcb3c6358f46bcc2
Parents: 0d008b4
Author: Mingliang Liu <lium...@apache.org>
Authored: Thu Sep 6 23:01:52 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Tue Sep 25 11:31:39 2018 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/util/Sleeper.java   | 31 +++----
 .../org/apache/hadoop/hbase/master/HMaster.java |  3 +-
 .../hbase/regionserver/HRegionServer.java       | 16 ++--
 .../TestRegionServerReportForDuty.java          | 88 ++++++++++++++++++++
 4 files changed, 111 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/fea75742/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
----------------------------------------------------------------------
diff --git 
a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java 
b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
index 7d4d692..93ef08c 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
@@ -50,13 +50,6 @@ public class Sleeper {
   }
 
   /**
-   * Sleep for period.
-   */
-  public void sleep() {
-    sleep(System.currentTimeMillis());
-  }
-
-  /**
    * If currently asleep, stops sleeping; if not asleep, will skip the next
    * sleep cycle.
    */
@@ -68,28 +61,24 @@ public class Sleeper {
   }
 
   /**
-   * Sleep for period adjusted by passed <code>startTime</code>
-   * @param startTime Time some task started previous to now.  Time to sleep
-   * will be docked current time minus passed <code>startTime</code>.
+   * Sleep for period.
    */
-  public void sleep(final long startTime) {
+  public void sleep() {
+    sleep(this.period);
+  }
+
+  public void sleep(long sleepTime) {
     if (this.stopper.isStopped()) {
       return;
     }
     long now = System.currentTimeMillis();
-    long waitTime = this.period - (now - startTime);
-    if (waitTime > this.period) {
-      LOG.warn("Calculated wait time > " + this.period +
-        "; setting to this.period: " + System.currentTimeMillis() + ", " +
-        startTime);
-      waitTime = this.period;
-    }
-    while (waitTime > 0) {
+    long currentSleepTime = sleepTime;
+    while (currentSleepTime > 0) {
       long woke = -1;
       try {
         synchronized (sleepLock) {
           if (triggerWake) break;
-          sleepLock.wait(waitTime);
+          sleepLock.wait(currentSleepTime);
         }
         woke = System.currentTimeMillis();
         long slept = woke - now;
@@ -108,7 +97,7 @@ public class Sleeper {
       }
       // Recalculate waitTime.
       woke = (woke == -1)? System.currentTimeMillis(): woke;
-      waitTime = this.period - (woke - startTime);
+      currentSleepTime = this.period - (woke - now);
     }
     synchronized(sleepLock) {
       triggerWake = false;

http://git-wip-us.apache.org/repos/asf/hbase/blob/fea75742/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index d7e57e8..475b0ca 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -2923,7 +2923,8 @@ public class HMaster extends HRegionServer implements 
MasterServices {
     }
   }
 
-  void checkServiceStarted() throws ServerNotRunningYetException {
+  @VisibleForTesting
+  protected void checkServiceStarted() throws ServerNotRunningYetException {
     if (!serviceStarted) {
       throw new ServerNotRunningYetException("Server is not running yet");
     }

http://git-wip-us.apache.org/repos/asf/hbase/blob/fea75742/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 1f84f9f..5080194 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -147,6 +147,8 @@ import org.apache.hadoop.hbase.util.HasThread;
 import org.apache.hadoop.hbase.util.JvmPauseMonitor;
 import org.apache.hadoop.hbase.util.NettyEventLoopGroupConfig;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.RetryCounter;
+import org.apache.hadoop.hbase.util.RetryCounterFactory;
 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
 import org.apache.hadoop.hbase.util.Sleeper;
 import org.apache.hadoop.hbase.util.Threads;
@@ -937,14 +939,18 @@ public class HRegionServer extends HasThread implements
         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
       }
 
-      // Try and register with the Master; tell it we are here.  Break if
-      // server is stopped or the clusterup flag is down or hdfs went wacky.
-      // Once registered successfully, go ahead and start up all Services.
+      // Try and register with the Master; tell it we are here.  Break if 
server is stopped or the
+      // clusterup flag is down or hdfs went wacky. Once registered 
successfully, go ahead and start
+      // up all Services. Use RetryCounter to get backoff in case Master is 
struggling to come up.
+      RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE,
+          this.sleeper.getPeriod(), 1000 * 60 * 5);
+      RetryCounter rc = rcf.create();
       while (keepLooping()) {
         RegionServerStartupResponse w = reportForDuty();
         if (w == null) {
-          LOG.warn("reportForDuty failed; sleeping and then retrying.");
-          this.sleeper.sleep();
+          long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
+          LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", 
sleepTime);
+          this.sleeper.sleep(sleepTime);
         } else {
           handleReportForDutyResponse(w);
           break;

http://git-wip-us.apache.org/repos/asf/hbase/blob/fea75742/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
index 83632be..aaf2d2e 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
@@ -21,6 +21,9 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
+import java.io.StringWriter;
+
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
@@ -28,11 +31,17 @@ import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.LoadBalancer;
 import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
+import org.apache.log4j.Appender;
+import org.apache.log4j.Layout;
+import org.apache.log4j.PatternLayout;
+import org.apache.log4j.WriterAppender;
 import org.apache.zookeeper.KeeperException;
 import org.junit.After;
 import org.junit.Before;
@@ -78,6 +87,85 @@ public class TestRegionServerReportForDuty {
   }
 
   /**
+   * LogCapturer is similar to {@link 
org.apache.hadoop.test.GenericTestUtils.LogCapturer}
+   * except that this implementation has a default appender to the root logger.
+   * Hadoop 2.8+ supports the default appender in the LogCapture it ships and 
this can be replaced.
+   * TODO: This class can be removed after we upgrade Hadoop dependency.
+   */
+  static class LogCapturer {
+    private StringWriter sw = new StringWriter();
+    private WriterAppender appender;
+    private org.apache.log4j.Logger logger;
+
+    LogCapturer(org.apache.log4j.Logger logger) {
+      this.logger = logger;
+      Appender defaultAppender = 
org.apache.log4j.Logger.getRootLogger().getAppender("stdout");
+      if (defaultAppender == null) {
+        defaultAppender = 
org.apache.log4j.Logger.getRootLogger().getAppender("console");
+      }
+      final Layout layout = (defaultAppender == null) ? new PatternLayout() :
+          defaultAppender.getLayout();
+      this.appender = new WriterAppender(layout, sw);
+      this.logger.addAppender(this.appender);
+    }
+
+    String getOutput() {
+      return sw.toString();
+    }
+
+    public void stopCapturing() {
+      this.logger.removeAppender(this.appender);
+    }
+  }
+
+  /**
+   * This test HMaster class will always throw ServerNotRunningYetException if 
checked.
+   */
+  public static class NeverInitializedMaster extends HMaster {
+    public NeverInitializedMaster(Configuration conf) throws IOException, 
KeeperException {
+      super(conf);
+    }
+
+    @Override
+    protected void checkServiceStarted() throws ServerNotRunningYetException {
+      throw new ServerNotRunningYetException("Server is not running yet");
+    }
+  }
+
+  /**
+   * Tests region server should backoff to report for duty if master is not 
ready.
+   */
+  @Test
+  public void testReportForDutyBackoff() throws IOException, 
InterruptedException {
+    cluster.getConfiguration().set(HConstants.MASTER_IMPL, 
NeverInitializedMaster.class.getName());
+    master = cluster.addMaster();
+    master.start();
+
+    LogCapturer capturer = new 
LogCapturer(org.apache.log4j.Logger.getLogger(HRegionServer.class));
+    // Set sleep interval relatively low so that exponential backoff is more 
demanding.
+    int msginterval = 100;
+    cluster.getConfiguration().setInt("hbase.regionserver.msginterval", 
msginterval);
+    rs = cluster.addRegionServer();
+    rs.start();
+
+    int interval = 10_000;
+    Thread.sleep(interval);
+    capturer.stopCapturing();
+    String output = capturer.getOutput();
+    LOG.info("{}", output);
+    String failMsg = "reportForDuty failed;";
+    int count = StringUtils.countMatches(output, failMsg);
+
+    // Following asserts the actual retry number is in range (expectedRetry/2, 
expectedRetry*2).
+    // Ideally we can assert the exact retry count. We relax here to tolerate 
contention error.
+    int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval));
+    assertTrue(String.format("reportForDuty retries %d times, less than 
expected min %d",
+        count, expectedRetry / 2), count > expectedRetry / 2);
+    assertTrue(String.format("reportForDuty retries %d times, more than 
expected max %d",
+        count, expectedRetry * 2), count < expectedRetry * 2);
+  }
+
+  /**
    * Tests region sever reportForDuty with backup master becomes primary 
master after
    * the first master goes away.
    */

Reply via email to