HBASE-15615 Wrong sleep time when RegionServerCallable need retry (Guanghao 
Zhang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/2482062d
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/2482062d
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/2482062d

Branch: refs/heads/HBASE-14850
Commit: 2482062d3475ae2ad9fc32f0f346e83892f81ad0
Parents: 6904430
Author: Mikhail Antonov <anto...@apache.org>
Authored: Sun May 15 20:37:02 2016 -0700
Committer: Mikhail Antonov <anto...@apache.org>
Committed: Sun May 15 20:37:26 2016 -0700

----------------------------------------------------------------------
 .../client/AbstractRegionServerCallable.java    |   3 +-
 .../hadoop/hbase/client/ConnectionUtils.java    |   3 +
 .../client/RegionAdminServiceCallable.java      |   2 +-
 .../hbase/client/RpcRetryingCallerImpl.java     |   4 +-
 .../hbase/client/TestConnectionUtils.java       |  20 +++
 .../org/apache/hadoop/hbase/client/TestHCM.java | 136 +++++++++++++++----
 6 files changed, 137 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java
index 4a0ea28..7279d81 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java
@@ -100,8 +100,7 @@ abstract class AbstractRegionServerCallable<T> implements 
RetryingCallable<T> {
 
   @Override
   public long sleep(long pause, int tries) {
-    // Tries hasn't been bumped up yet so we use "tries + 1" to get right 
pause time
-    long sleep = ConnectionUtils.getPauseTime(pause, tries + 1);
+    long sleep = ConnectionUtils.getPauseTime(pause, tries);
     if (sleep < MIN_WAIT_DEAD_SERVER
         && (location == null || 
getConnection().isDeadServer(location.getServerName()))) {
       sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f);

http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java
index 82c2fc4..363a0e0 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java
@@ -55,6 +55,9 @@ public final class ConnectionUtils {
     if (ntries >= HConstants.RETRY_BACKOFF.length) {
       ntries = HConstants.RETRY_BACKOFF.length - 1;
     }
+    if (ntries < 0) {
+      ntries = 0;
+    }
 
     long normalPause = pause * HConstants.RETRY_BACKOFF[ntries];
     long jitter =  (long)(normalPause * RANDOM.nextFloat() * 0.01f); // 1% 
possible jitter

http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java
index 8ff8b8b..725bec0 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java
@@ -140,7 +140,7 @@ public abstract class RegionAdminServiceCallable<T> 
implements RetryingCallable<
 
   @Override
   public long sleep(long pause, int tries) {
-    long sleep = ConnectionUtils.getPauseTime(pause, tries + 1);
+    long sleep = ConnectionUtils.getPauseTime(pause, tries);
     if (sleep < MIN_WAIT_DEAD_SERVER
         && (location == null || 
connection.isDeadServer(location.getServerName()))) {
       sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f);

http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java
----------------------------------------------------------------------
diff --git 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java
 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java
index 41405ec..8f28796 100644
--- 
a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java
+++ 
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java
@@ -124,8 +124,8 @@ public class RpcRetryingCallerImpl<T> implements 
RpcRetryingCaller<T> {
         }
         // If the server is dead, we need to wait a little before retrying, to 
give
         //  a chance to the regions to be
-        // tries hasn't been bumped up yet so we use "tries + 1" to get right 
pause time
-        expectedSleep = callable.sleep(pause, tries + 1);
+        // get right pause time, start by RETRY_BACKOFF[0] * pause
+        expectedSleep = callable.sleep(pause, tries);
 
         // If, after the planned sleep, there won't be enough time left, we 
stop now.
         long duration = singleCallDuration(expectedSleep);

http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java
index ac0a0bd..aea8a2b 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java
@@ -19,6 +19,7 @@
  */
 package org.apache.hadoop.hbase.client;
 
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.testclassification.ClientTests;
 import org.apache.hadoop.hbase.testclassification.SmallTests;
 import org.junit.Test;
@@ -54,4 +55,23 @@ public class TestConnectionUtils {
     assertTrue(retyTimeSet.size() > (retries.length * 0.80));
   }
 
+  @Test
+  public void testGetPauseTime() {
+    long pauseTime;
+    long baseTime = 100;
+    pauseTime = ConnectionUtils.getPauseTime(baseTime, -1);
+    assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[0]));
+    assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[0] * 1.01f));
+
+    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
+      pauseTime = ConnectionUtils.getPauseTime(baseTime, i);
+      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
+      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 
1.01f));
+    }
+
+    int length = HConstants.RETRY_BACKOFF.length;
+    pauseTime = ConnectionUtils.getPauseTime(baseTime, length);
+    assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[length - 1]));
+    assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[length - 1] * 
1.01f));
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java
index a0f91f4..289e6f2 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java
@@ -62,6 +62,7 @@ import 
org.apache.hadoop.hbase.exceptions.RegionMovedException;
 import org.apache.hadoop.hbase.filter.Filter;
 import org.apache.hadoop.hbase.filter.FilterBase;
 import org.apache.hadoop.hbase.ipc.RpcClient;
+import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.regionserver.Region;
@@ -114,14 +115,25 @@ public class TestHCM {
 */
   public static class SleepAndFailFirstTime extends BaseRegionObserver {
     static final AtomicLong ct = new AtomicLong(0);
+    static final String SLEEP_TIME_CONF_KEY =
+        "hbase.coprocessor.SleepAndFailFirstTime.sleepTime";
+    static final long DEFAULT_SLEEP_TIME = 20000;
+    static final AtomicLong sleepTime = new AtomicLong(DEFAULT_SLEEP_TIME);
 
     public SleepAndFailFirstTime() {
     }
 
     @Override
+    public void postOpen(ObserverContext<RegionCoprocessorEnvironment> c) {
+      RegionCoprocessorEnvironment env = c.getEnvironment();
+      Configuration conf = env.getConfiguration();
+      sleepTime.set(conf.getLong(SLEEP_TIME_CONF_KEY, DEFAULT_SLEEP_TIME));
+    }
+
+    @Override
     public void preGetOp(final ObserverContext<RegionCoprocessorEnvironment> e,
               final Get get, final List<Cell> results) throws IOException {
-      Threads.sleep(20000);
+      Threads.sleep(sleepTime.get());
       if (ct.incrementAndGet() == 1){
         throw new IOException("first call I fail");
       }
@@ -352,6 +364,95 @@ public class TestHCM {
     }
   }
 
+  /**
+   * Test starting from 0 index when RpcRetryingCaller calculate the backoff 
time.
+   */
+  @Test
+  public void testRpcRetryingCallerSleep() throws Exception {
+    HTableDescriptor hdt = 
TEST_UTIL.createTableDescriptor("HCM-testRpcRetryingCallerSleep");
+    hdt.addCoprocessorWithSpec("|" + SleepAndFailFirstTime.class.getName() + 
"||"
+        + SleepAndFailFirstTime.SLEEP_TIME_CONF_KEY + "=2000");
+    TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }).close();
+
+    Configuration c = new Configuration(TEST_UTIL.getConfiguration());
+    c.setInt(HConstants.HBASE_CLIENT_PAUSE, 3000);
+    c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 4000);
+
+    Connection connection = ConnectionFactory.createConnection(c);
+    Table t = 
connection.getTable(TableName.valueOf("HCM-testRpcRetryingCallerSleep"));
+    if (t instanceof HTable) {
+      HTable table = (HTable) t;
+      table.setOperationTimeout(8000);
+      // Check that it works. Because 2s + 3s * RETRY_BACKOFF[0] + 2s < 8s
+      table.get(new Get(FAM_NAM));
+
+      // Resetting and retrying.
+      SleepAndFailFirstTime.ct.set(0);
+      try {
+        table.setOperationTimeout(6000);
+        // Will fail this time. After sleep, there are not enough time for 
second retry
+        // Beacuse 2s + 3s + 2s > 6s
+        table.get(new Get(FAM_NAM));
+        Assert.fail("We expect an exception here");
+      } catch (SocketTimeoutException e) {
+        LOG.info("We received an exception, as expected ", e);
+      } catch (IOException e) {
+        Assert.fail("Wrong exception:" + e.getMessage());
+      } finally {
+        table.close();
+        connection.close();
+      }
+    }
+  }
+
+  @Test
+  public void testCallableSleep() throws Exception {
+    long pauseTime;
+    long baseTime = 100;
+    TableName tableName = TableName.valueOf("HCM-testCallableSleep");
+    HTable table = TEST_UTIL.createTable(tableName, FAM_NAM);
+    RegionServerCallable<Object> regionServerCallable = new 
RegionServerCallable<Object>(
+        TEST_UTIL.getConnection(), tableName, ROW) {
+      public Object call(int timeout) throws IOException {
+        return null;
+      }
+    };
+
+    regionServerCallable.prepare(false);
+    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
+      pauseTime = regionServerCallable.sleep(baseTime, i);
+      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
+      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 
1.01f));
+    }
+
+    RegionAdminServiceCallable<Object> regionAdminServiceCallable =
+        new RegionAdminServiceCallable<Object>(
+        (ClusterConnection) TEST_UTIL.getConnection(), new 
RpcControllerFactory(
+            TEST_UTIL.getConfiguration()), tableName, ROW) {
+      public Object call(int timeout) throws IOException {
+        return null;
+      }
+    };
+
+    regionAdminServiceCallable.prepare(false);
+    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
+      pauseTime = regionAdminServiceCallable.sleep(baseTime, i);
+      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
+      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 
1.01f));
+    }
+
+    MasterCallable masterCallable = new MasterCallable((HConnection) 
TEST_UTIL.getConnection()) {
+      public Object call(int timeout) throws IOException {
+        return null;
+      }
+    };
+
+    for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
+      pauseTime = masterCallable.sleep(baseTime, i);
+      assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i]));
+      assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 
1.01f));
+    }
+  }
 
   private void testConnectionClose(boolean allowsInterrupt) throws Exception {
     TableName tableName = TableName.valueOf("HCM-testConnectionClose" + 
allowsInterrupt);
@@ -1032,7 +1133,7 @@ public class TestHCM {
     }
   }
 
-  @Ignore ("Test presumes RETRY_BACKOFF will never change; it has") @Test
+  @Test
   public void testErrorBackoffTimeCalculation() throws Exception {
     // TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it 
should not.
     final long ANY_PAUSE = 100;
@@ -1052,40 +1153,23 @@ public class TestHCM {
 
       // Check some backoff values from HConstants sequence.
       tracker.reportServerError(location);
-      assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(location, 
ANY_PAUSE));
+      assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0],
+        tracker.calculateBackoffTime(location, ANY_PAUSE));
       tracker.reportServerError(location);
       tracker.reportServerError(location);
       tracker.reportServerError(location);
-      assertEqualsWithJitter(ANY_PAUSE * 5, 
tracker.calculateBackoffTime(location, ANY_PAUSE));
+      assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[3],
+        tracker.calculateBackoffTime(location, ANY_PAUSE));
 
       // All of this shouldn't affect backoff for different location.
       assertEquals(0, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE));
       tracker.reportServerError(diffLocation);
-      assertEqualsWithJitter(ANY_PAUSE, 
tracker.calculateBackoffTime(diffLocation, ANY_PAUSE));
+      assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0],
+        tracker.calculateBackoffTime(diffLocation, ANY_PAUSE));
 
       // Check with different base.
-      assertEqualsWithJitter(ANY_PAUSE * 10,
+      assertEqualsWithJitter(ANY_PAUSE * 2 * HConstants.RETRY_BACKOFF[3],
           tracker.calculateBackoffTime(location, ANY_PAUSE * 2));
-
-      // See that time from last error is taken into account. Time shift is 
applied after jitter,
-      // so pass the original expected backoff as the base for jitter.
-      long timeShift = (long)(ANY_PAUSE * 0.5);
-      timeMachine.setValue(timeBase + timeShift);
-      assertEqualsWithJitter((ANY_PAUSE * 5) - timeShift,
-        tracker.calculateBackoffTime(location, ANY_PAUSE), ANY_PAUSE * 2);
-
-      // However we should not go into negative.
-      timeMachine.setValue(timeBase + ANY_PAUSE * 100);
-      assertEquals(0, tracker.calculateBackoffTime(location, ANY_PAUSE));
-
-      // We also should not go over the boundary; last retry would be on it.
-      long timeLeft = (long)(ANY_PAUSE * 0.5);
-      timeMachine.setValue(timeBase + largeAmountOfTime - timeLeft);
-      assertTrue(tracker.canTryMore(1));
-      tracker.reportServerError(location);
-      assertEquals(timeLeft, tracker.calculateBackoffTime(location, 
ANY_PAUSE));
-      timeMachine.setValue(timeBase + largeAmountOfTime);
-      assertFalse(tracker.canTryMore(1));
     } finally {
       EnvironmentEdgeManager.reset();
     }

Reply via email to