This is an automated email from the ASF dual-hosted git repository.
zghao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push:
new 942f8c4 HBASE-22193 Add backoff when region failed open too many times
new 249ac58 Merge pull request #133 from infraio/retry-backoff
942f8c4 is described below
commit 942f8c45cd1a1e0a8956fc10b811dd2add510645
Author: Guanghao Zhang <[email protected]>
AuthorDate: Tue Apr 9 18:17:03 2019 +0800
HBASE-22193 Add backoff when region failed open too many times
---
.../hbase/master/assignment/AssignmentManager.java | 11 +++++++++
.../assignment/TransitRegionStateProcedure.java | 26 +++++++++++++---------
2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 1aea8f9..5bdbb92 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -131,6 +131,10 @@ public class AssignmentManager {
"hbase.assignment.maximum.attempts";
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
+ public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS =
+ "hbase.assignment.retry.immediately.maximum.attempts";
+ private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3;
+
/** Region in Transition metrics threshold time */
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
"hbase.metrics.rit.stuck.warning.threshold";
@@ -151,6 +155,7 @@ public class AssignmentManager {
private final int assignDispatchWaitQueueMaxSize;
private final int assignDispatchWaitMillis;
private final int assignMaxAttempts;
+ private final int assignRetryImmediatelyMaxAttempts;
private final Object checkIfShouldMoveSystemRegionLock = new Object();
@@ -179,6 +184,8 @@ public class AssignmentManager {
this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
DEFAULT_ASSIGN_MAX_ATTEMPTS));
+    this.assignRetryImmediatelyMaxAttempts = conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS,
+      DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS);
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
@@ -308,6 +315,10 @@ public class AssignmentManager {
return assignMaxAttempts;
}
+ int getAssignRetryImmediatelyMaxAttempts() {
+ return assignRetryImmediatelyMaxAttempts;
+ }
+
public RegionStates getRegionStates() {
return regionStates;
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 1be7a9a..716db69 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -226,20 +226,32 @@ public class TransitRegionStateProcedure
return Flow.HAS_MORE_STATE;
}
-    if (incrementAndCheckMaxAttempts(env, regionNode)) {
+    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
+      .incrementAndGetRetries();
+    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
+    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
+
+ if (retries >= maxAttempts) {
env.getAssignmentManager().regionFailedOpen(regionNode, true);
setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() +
" exceeded"));
regionNode.unsetProcedure(this);
return Flow.NO_MORE_STATE;
}
+
env.getAssignmentManager().regionFailedOpen(regionNode, false);
// we failed to assign the region, force a new plan
forceNewPlan = true;
regionNode.setRegionLocation(null);
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
-    // Here we do not throw an exception because we want the region to be online ASAP
- return Flow.HAS_MORE_STATE;
+
+    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
+      // Throw an exception to back off and retry when the region has failed to open too many times
+ throw new HBaseIOException("Failed to open region");
+ } else {
+      // Here we do not throw an exception because we want the region to be online ASAP
+ return Flow.HAS_MORE_STATE;
+ }
}
private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
throws IOException {
@@ -400,14 +412,6 @@ public class TransitRegionStateProcedure
this.remoteProc = null;
}
-  private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
-    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
-      .incrementAndGetRetries();
-    int max = env.getAssignmentManager().getAssignMaxAttempts();
-    LOG.info("Retry={} of max={}; {}; {}", retries, max, this, regionNode.toShortString());
-    return retries >= max;
-  }
-
@Override
protected void rollbackState(MasterProcedureEnv env,
RegionStateTransitionState state)
throws IOException, InterruptedException {