This is an automated email from the ASF dual-hosted git repository.

zghao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/master by this push:
     new 942f8c4  HBASE-22193 Add backoff when region failed open too many times
     new 249ac58  Merge pull request #133 from infraio/retry-backoff
942f8c4 is described below

commit 942f8c45cd1a1e0a8956fc10b811dd2add510645
Author: Guanghao Zhang <[email protected]>
AuthorDate: Tue Apr 9 18:17:03 2019 +0800

    HBASE-22193 Add backoff when region failed open too many times
---
 .../hbase/master/assignment/AssignmentManager.java | 11 +++++++++
 .../assignment/TransitRegionStateProcedure.java    | 26 +++++++++++++---------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 1aea8f9..5bdbb92 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -131,6 +131,10 @@ public class AssignmentManager {
       "hbase.assignment.maximum.attempts";
   private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
 
+  public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS =
+      "hbase.assignment.retry.immediately.maximum.attempts";
+  private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3;
+
   /** Region in Transition metrics threshold time */
   public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
       "hbase.metrics.rit.stuck.warning.threshold";
@@ -151,6 +155,7 @@ public class AssignmentManager {
   private final int assignDispatchWaitQueueMaxSize;
   private final int assignDispatchWaitMillis;
   private final int assignMaxAttempts;
+  private final int assignRetryImmediatelyMaxAttempts;
 
   private final Object checkIfShouldMoveSystemRegionLock = new Object();
 
@@ -179,6 +184,8 @@ public class AssignmentManager {
 
     this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
         DEFAULT_ASSIGN_MAX_ATTEMPTS));
+    this.assignRetryImmediatelyMaxAttempts = 
conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS,
+        DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS);
 
     int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
         DEFAULT_RIT_CHORE_INTERVAL_MSEC);
@@ -308,6 +315,10 @@ public class AssignmentManager {
     return assignMaxAttempts;
   }
 
+  int getAssignRetryImmediatelyMaxAttempts() {
+    return assignRetryImmediatelyMaxAttempts;
+  }
+
   public RegionStates getRegionStates() {
     return regionStates;
   }
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 1be7a9a..716db69 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -226,20 +226,32 @@ public class TransitRegionStateProcedure
       return Flow.HAS_MORE_STATE;
     }
 
-    if (incrementAndCheckMaxAttempts(env, regionNode)) {
+    int retries = 
env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
+        .incrementAndGetRetries();
+    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
+    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, 
regionNode.toShortString());
+
+    if (retries >= maxAttempts) {
       env.getAssignmentManager().regionFailedOpen(regionNode, true);
       setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
         "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + 
" exceeded"));
       regionNode.unsetProcedure(this);
       return Flow.NO_MORE_STATE;
     }
+
     env.getAssignmentManager().regionFailedOpen(regionNode, false);
     // we failed to assign the region, force a new plan
     forceNewPlan = true;
     regionNode.setRegionLocation(null);
     
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
-    // Here we do not throw exception because we want the region to be online ASAP
-    return Flow.HAS_MORE_STATE;
+
+    if (retries > 
env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
+      // Throw exception to backoff and retry when failed open too many times
+      throw new HBaseIOException("Failed to open region");
+    } else {
      // Here we do not throw exception because we want the region to be online ASAP
+      return Flow.HAS_MORE_STATE;
+    }
   }
 
   private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) 
throws IOException {
@@ -400,14 +412,6 @@ public class TransitRegionStateProcedure
     this.remoteProc = null;
   }
 
-  private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, 
RegionStateNode regionNode) {
-    int retries = 
env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
-      .incrementAndGetRetries();
-    int max = env.getAssignmentManager().getAssignMaxAttempts();
-    LOG.info("Retry={} of max={}; {}; {}", retries, max, this, 
regionNode.toShortString());
-    return retries >= max;
-  }
-
   @Override
   protected void rollbackState(MasterProcedureEnv env, 
RegionStateTransitionState state)
       throws IOException, InterruptedException {

Reply via email to the mailing list.