HBASE-20548 Master fails to start up on large clusters while refreshing block distribution
Signed-off-by: Andrew Purtell <apurt...@apache.org> Conflicts: hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/16f8aac6 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/16f8aac6 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/16f8aac6 Branch: refs/heads/branch-2.0 Commit: 16f8aac60d6365376572225ebf61f92927f04fc8 Parents: e3deb91 Author: Thiruvel Thirumoolan <thiru...@oath.com> Authored: Thu May 24 01:01:54 2018 -0700 Committer: Andrew Purtell <apurt...@apache.org> Committed: Thu May 24 15:47:23 2018 -0700 ---------------------------------------------------------------------- .../hbase/rsgroup/RSGroupBasedLoadBalancer.java | 5 ++++ .../org/apache/hadoop/hbase/master/HMaster.java | 11 +++++++++ .../hadoop/hbase/master/LoadBalancer.java | 5 ++++ .../hbase/master/balancer/BaseLoadBalancer.java | 25 +++++++++++++------- 4 files changed, 38 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/16f8aac6/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java ---------------------------------------------------------------------- diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java index 9c02bc6..a6a57e7 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java @@ -462,4 +462,9 @@ public class RSGroupBasedLoadBalancer implements RSGroupableBalancer { public void setRsGroupInfoManager(RSGroupInfoManager rsGroupInfoManager) { this.rsGroupInfoManager = rsGroupInfoManager; } + + @Override + public 
void postMasterStartupInitialize() { + this.internalBalancer.postMasterStartupInitialize(); + } } http://git-wip-us.apache.org/repos/asf/hbase/blob/16f8aac6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index f3871e0..d29ac70 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -971,6 +971,17 @@ public class HMaster extends HRegionServer implements MasterServices { } zombieDetector.interrupt(); + + /* + * After master has started up, lets do balancer post startup initialization. Since this runs + * in activeMasterManager thread, it should be fine. + */ + long start = System.currentTimeMillis(); + this.balancer.postMasterStartupInitialize(); + if (LOG.isDebugEnabled()) { + LOG.debug("Balancer post startup initialization complete, took " + ( + (System.currentTimeMillis() - start) / 1000) + " seconds"); + } } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/16f8aac6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java index 917da08..aa88f49 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java @@ -160,6 +160,11 @@ public interface LoadBalancer extends Configurable, Stoppable, ConfigurationObse void onConfigurationChange(Configuration conf); /** + * If balancer needs to do initialization after Master has started up, lets do that here. 
+ */ + void postMasterStartupInitialize(); + + /** * @return true if Master carries regions */ static boolean isTablesOnMaster(Configuration conf) { http://git-wip-us.apache.org/repos/asf/hbase/blob/16f8aac6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java index 08e1ab1..98c3843 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java @@ -1151,6 +1151,19 @@ public abstract class BaseLoadBalancer implements LoadBalancer { } } + @Override + public void postMasterStartupInitialize() { + if (services != null && regionFinder != null) { + try { + Set<RegionInfo> regions = + services.getAssignmentManager().getRegionStates().getRegionAssignments().keySet(); + regionFinder.refreshAndWait(regions); + } catch (Exception e) { + LOG.warn("Refreshing region HDFS Block dist failed with exception, ignoring", e); + } + } + } + public void setRackManager(RackManager rackManager) { this.rackManager = rackManager; } @@ -1249,7 +1262,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { return assignments; } - Cluster cluster = createCluster(servers, regions, false); + Cluster cluster = createCluster(servers, regions); List<RegionInfo> unassignedRegions = new ArrayList<>(); roundRobinAssignment(cluster, regions, unassignedRegions, @@ -1288,11 +1301,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { return assignments; } - protected Cluster createCluster(List<ServerName> servers, - Collection<RegionInfo> regions, boolean forceRefresh) { - if (forceRefresh && useRegionFinder) { - regionFinder.refreshAndWait(regions); 
- } + protected Cluster createCluster(List<ServerName> servers, Collection<RegionInfo> regions) { // Get the snapshot of the current assignments for the regions in question, and then create // a cluster out of it. Note that we might have replicas already assigned to some servers // earlier. So we want to get the snapshot to see those assignments, but this will only contain @@ -1346,7 +1355,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { final List<ServerName> finalServers = idleServers.isEmpty() ? servers : idleServers; List<RegionInfo> regions = Lists.newArrayList(regionInfo); - Cluster cluster = createCluster(finalServers, regions, false); + Cluster cluster = createCluster(finalServers, regions); return randomAssignment(cluster, regionInfo, finalServers); } @@ -1416,7 +1425,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { int numRandomAssignments = 0; int numRetainedAssigments = 0; - Cluster cluster = createCluster(servers, regions.keySet(), true); + Cluster cluster = createCluster(servers, regions.keySet()); for (Map.Entry<RegionInfo, ServerName> entry : regions.entrySet()) { RegionInfo region = entry.getKey();