Repository: hbase
Updated Branches:
  refs/heads/master 2b1cbeb28 -> fe4438361
HBASE-16209 provide an ExponentialBackOffPolicy sleep between failed region open requests

Signed-off-by: Elliott Clark <ecl...@apache.org>

Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/fe443836
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/fe443836
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/fe443836

Branch: refs/heads/master
Commit: fe44383619d0d11396382f2bfa493ad93c8b0e5f
Parents: 2b1cbeb
Author: Joseph Hwang <j...@fb.com>
Authored: Tue Jul 26 11:13:32 2016 -0700
Committer: Elliott Clark <ecl...@apache.org>
Committed: Fri Jul 29 09:47:00 2016 -0700

----------------------------------------------------------------------
 .../master/AssignmentManagerStatusTmpl.jamon | 20 ++++++-
 .../hadoop/hbase/master/AssignmentManager.java | 56 ++++++++++++++++++--
 .../hadoop/hbase/master/RegionStates.java | 8 ++-
 .../hbase/master/TestMasterStatusServlet.java | 8 ++-
 4 files changed, 84 insertions(+), 8 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hbase/blob/fe443836/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
index e2ae09d..06b320f 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
@@ -25,6 +25,8 @@ org.apache.hadoop.hbase.HBaseConfiguration;
 org.apache.hadoop.hbase.HConstants;
 java.util.HashSet;
 java.util.SortedSet;
+java.util.Map;
+java.util.concurrent.atomic.AtomicInteger;
 </%import>
 <%args>
 AssignmentManager
assignmentManager; @@ -32,7 +34,9 @@ int limit = 100; </%args> <%java SortedSet<RegionState> rit = assignmentManager - .getRegionStates().getRegionsInTransitionOrderedByTimestamp(); %> + .getRegionStates().getRegionsInTransitionOrderedByTimestamp(); + Map<String, AtomicInteger> failedRegionTracker = assignmentManager.getFailedOpenTracker(); + %> <%if !rit.isEmpty() %> <%java> @@ -83,7 +87,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); <div class="tab-pane" id="tab_rits<% (recordItr / ritsPerPage) + 1 %>"> </%if> <table class="table table-striped" style="margin-bottom:0px;"><tr><th>Region</th> - <th>State</th><th>RIT time (ms)</th></tr> + <th>State</th><th>RIT time (ms)</th> <th>Retries </th></tr> </%if> <%if ritsOverThreshold.contains(rs.getRegion().getEncodedName()) %> @@ -93,9 +97,21 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); <%else> <tr> </%if> + <%java> + String retryStatus = "0"; + String name = rs.getRegion().getEncodedName(); + RegionState state = assignmentManager.getState(name); + AtomicInteger numOpenRetries = failedRegionTracker.get(name); + if (numOpenRetries != null ) { + retryStatus = Integer.toString(numOpenRetries.get()); + } else if (state.getState() == RegionState.State.FAILED_OPEN) { + retryStatus = "Failed"; + } + </%java> <td><% rs.getRegion().getEncodedName() %></td><td> <% HRegionInfo.getDescriptiveNameFromRegionStateForDisplay(rs, conf) %></td> <td><% (currentTime - rs.getStamp()) %> </td> + <td> <% retryStatus %> </td> </tr> <%java recordItr++; %> <%if (recordItr % ritsPerPage) == 0 %> http://git-wip-us.apache.org/repos/asf/hbase/blob/fe443836/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 2ffe466..ffdbac8 
100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -36,6 +36,7 @@ import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; @@ -87,6 +88,7 @@ import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.KeyLocker; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.PairOfSameType; +import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; @@ -149,8 +151,8 @@ public class AssignmentManager { private final ExecutorService executorService; - // Thread pool executor service. TODO, consolidate with executorService? private java.util.concurrent.ExecutorService threadPoolExecutorService; + private ScheduledThreadPoolExecutor scheduledThreadPoolExecutor; private final RegionStates regionStates; @@ -202,6 +204,8 @@ public class AssignmentManager { private RegionStateListener regionStateListener; + private RetryCounter.BackoffPolicy backoffPolicy; + private RetryCounter.RetryConfig retryConfig; /** * Constructs a new assignment manager. 
* @@ -240,8 +244,13 @@ public class AssignmentManager { "hbase.meta.assignment.retry.sleeptime", 1000l); this.balancer = balancer; int maxThreads = conf.getInt("hbase.assignment.threads.max", 30); + this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool( - maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM.")); + maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM.")); + + this.scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1, + Threads.newDaemonThreadFactory("AM.Scheduler")); + this.regionStates = new RegionStates( server, tableStateManager, serverManager, regionStateStore); @@ -254,6 +263,23 @@ public class AssignmentManager { this.metricsAssignmentManager = new MetricsAssignmentManager(); this.tableLockManager = tableLockManager; + + // Configurations for retrying opening a region on receiving a FAILED_OPEN + this.retryConfig = new RetryCounter.RetryConfig(); + this.retryConfig.setSleepInterval(conf.getLong("hbase.assignment.retry.sleep.initial", 0l)); + // Set the max time limit to the initial sleep interval so we use a constant time sleep strategy + // if the user does not set a max sleep limit + this.retryConfig.setMaxSleepTime(conf.getLong("hbase.assignment.retry.sleep.max", + retryConfig.getSleepInterval())); + this.backoffPolicy = getBackoffPolicy(); + } + + /** + * Returns the backoff policy used for Failed Region Open retries + * @return the backoff policy used for Failed Region Open retries + */ + RetryCounter.BackoffPolicy getBackoffPolicy() { + return new RetryCounter.ExponentialBackoffPolicyWithLimit(); } MetricsAssignmentManager getAssignmentManagerMetrics() { @@ -2028,6 +2054,11 @@ public class AssignmentManager { threadPoolExecutorService.submit(new AssignCallable(this, regionInfo)); } + void invokeAssignLater(HRegionInfo regionInfo, long sleepMillis) { + scheduledThreadPoolExecutor.schedule(new DelayedAssignCallable( + new AssignCallable(this, regionInfo)), sleepMillis, 
TimeUnit.MILLISECONDS); + } + void invokeUnAssign(HRegionInfo regionInfo) { threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo)); } @@ -2233,7 +2264,10 @@ public class AssignmentManager { } catch (HBaseIOException e) { LOG.warn("Failed to get region plan", e); } - invokeAssign(hri); + // Have the current thread sleep a bit before resubmitting the RPC request + long sleepTime = backoffPolicy.getBackoffTime(retryConfig, + failedOpenTracker.get(encodedName).get()); + invokeAssignLater(hri, sleepTime); } } // Null means no error @@ -2732,6 +2766,10 @@ public class AssignmentManager { return replicasToClose; } + public Map<String, AtomicInteger> getFailedOpenTracker() {return failedOpenTracker;} + + public RegionState getState(String encodedName) {return regionStates.getRegionState(encodedName);} + /** * A region is offline. The new state should be the specified one, * if not null. If the specified state is null, the new state is Offline. @@ -2934,4 +2972,16 @@ public class AssignmentManager { void setRegionStateListener(RegionStateListener listener) { this.regionStateListener = listener; } + + private class DelayedAssignCallable implements Runnable { + Callable callable; + public DelayedAssignCallable(Callable callable) { + this.callable = callable; + } + + @Override + public void run() { + threadPoolExecutorService.submit(callable); + } + } } http://git-wip-us.apache.org/repos/asf/hbase/blob/fe443836/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index b95b894..ba08a05 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -67,10 +67,16 @@ public class RegionStates { 
public final static RegionStateStampComparator REGION_STATE_COMPARATOR = new RegionStateStampComparator(); + + // This comparator sorts the RegionStates by time stamp then Region name. + // Comparing by timestamp alone can lead us to discard different RegionStates that happen + // to share a timestamp. private static class RegionStateStampComparator implements Comparator<RegionState> { @Override public int compare(RegionState l, RegionState r) { - return Long.compare(l.getStamp(), r.getStamp()); + return Long.compare(l.getStamp(), r.getStamp()) == 0 ? + Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) : + Long.compare(l.getStamp(), r.getStamp()); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/fe443836/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java index 02f01c4..f975d9d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterStatusServlet.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.*; +import static org.mockito.Matchers.any; import java.io.IOException; import java.io.StringWriter; @@ -26,8 +27,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; @@ -178,6 +177,11 @@ public class TestMasterStatusServlet { Mockito.doReturn(rs).when(am).getRegionStates(); Mockito.doReturn(regionsInTransition).when(rs).getRegionsInTransition(); 
Mockito.doReturn(regionsInTransition).when(rs).getRegionsInTransitionOrderedByTimestamp(); + Mockito.when( + am.getState( + any(String.class) + ) + ).thenReturn(new RegionState(null, null)); // Render to a string StringWriter sw = new StringWriter();