Repository: hbase
Updated Branches:
  refs/heads/branch-1 e8c69a592 -> 9bdb88a57
HBASE-14536 Balancer & SSH interfering with each other leading to unavailability Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/9bdb88a5 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/9bdb88a5 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/9bdb88a5 Branch: refs/heads/branch-1 Commit: 9bdb88a572ac30fb51fcc44284f51543d2b4568f Parents: e8c69a5 Author: Stephen Yuan Jiang <syuanjiang...@gmail.com> Authored: Fri Oct 16 22:38:28 2015 -0700 Committer: Stephen Yuan Jiang <syuanjiang...@gmail.com> Committed: Fri Oct 16 22:38:40 2015 -0700 ---------------------------------------------------------------------- .../hadoop/hbase/master/AssignmentManager.java | 46 +++++++++++++++----- .../hadoop/hbase/master/ServerManager.java | 3 +- .../master/procedure/ServerCrashProcedure.java | 34 ++++++++++++--- .../procedure/TestServerCrashProcedure.java | 4 +- 4 files changed, 69 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 4fedbec..eef22c4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -259,6 +259,10 @@ public class AssignmentManager extends ZooKeeperListener { private RegionStateListener regionStateListener; + public enum ServerHostRegion { + NOT_HOSTING_REGION, HOSTING_REGION, UNKNOWN, + } + /** * Constructs a new assignment manager. 
* @@ -3371,16 +3375,16 @@ public class AssignmentManager extends ZooKeeperListener { threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo)); } - public boolean isCarryingMeta(ServerName serverName) { + public ServerHostRegion isCarryingMeta(ServerName serverName) { return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO); } - public boolean isCarryingMetaReplica(ServerName serverName, int replicaId) { + public ServerHostRegion isCarryingMetaReplica(ServerName serverName, int replicaId) { return isCarryingRegion(serverName, RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, replicaId)); } - public boolean isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) { + public ServerHostRegion isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) { return isCarryingRegion(serverName, metaHri); } @@ -3394,7 +3398,7 @@ public class AssignmentManager extends ZooKeeperListener { * processing hasn't finished yet when server shutdown occurs. * @return whether the serverName currently hosts the region */ - private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) { + private ServerHostRegion isCarryingRegion(ServerName serverName, HRegionInfo hri) { RegionTransition rt = null; try { byte [] data = ZKAssign.getData(watcher, hri.getEncodedName()); @@ -3412,17 +3416,37 @@ public class AssignmentManager extends ZooKeeperListener { boolean matchZK = addressFromZK.equals(serverName); LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK + " current=" + serverName + ", matches=" + matchZK); - return matchZK; + return matchZK ? 
ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION; } ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri); - boolean matchAM = (addressFromAM != null && - addressFromAM.equals(serverName)); - LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() + - " is on server=" + (addressFromAM != null ? addressFromAM : "null") + - " server being checked: " + serverName); + if (LOG.isDebugEnabled()) { + LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() + + " is on server=" + (addressFromAM != null ? addressFromAM : "null") + + " server being checked: " + serverName); + } + if (addressFromAM != null) { + return addressFromAM.equals(serverName) ? + ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION; + } + + if (hri.isMetaRegion() && RegionReplicaUtil.isDefaultReplica(hri)) { + // For the Meta region (default replica), we can do one more check on MetaTableLocator + final ServerName serverNameInZK = + server.getMetaTableLocator().getMetaRegionLocation(this.server.getZooKeeper()); + if (LOG.isDebugEnabled()) { + LOG.debug("Based on MetaTableLocator, the META region is on server=" + + (serverNameInZK == null ? "null" : serverNameInZK) + + " server being checked: " + serverName); + } + if (serverNameInZK != null) { + return serverNameInZK.equals(serverName) ? + ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION; + } + } - return matchAM; + // Checked everywhere, if reaching here, we are unsure whether the server is carrying region. 
+ return ServerHostRegion.UNKNOWN; } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index a9b1f17..af6339c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -617,7 +617,8 @@ public class ServerManager { return; } - boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName); + boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName) == + AssignmentManager.ServerHostRegion.HOSTING_REGION; this.services.getMasterProcedureExecutor(). submitProcedure(new ServerCrashProcedure(serverName, true, carryingMeta)); LOG.debug("Added=" + serverName + http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index fcc95b1..6bb0262 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -313,8 +313,9 @@ implements ServerProcedureInterface { private boolean processMeta(final MasterProcedureEnv env) throws IOException { if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName); - MasterFileSystem mfs = 
env.getMasterServices().getMasterFileSystem(); - AssignmentManager am = env.getMasterServices().getAssignmentManager(); + MasterServices services = env.getMasterServices(); + MasterFileSystem mfs = services.getMasterFileSystem(); + AssignmentManager am = services.getAssignmentManager(); HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO; if (this.shouldSplitWal) { if (this.distributedLogReplay) { @@ -328,9 +329,31 @@ implements ServerProcedureInterface { // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout boolean processed = true; - if (am.isCarryingMeta(serverName)) { + boolean shouldAssignMeta = false; + AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName); + switch (rsCarryingMetaRegion) { + case HOSTING_REGION: + LOG.info("Server " + serverName + " was carrying META. Trying to assign."); + am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO); + shouldAssignMeta = true; + break; + case UNKNOWN: + if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) { + // the meta location as per master is null. This could happen in case when meta + // assignment in previous run failed, while meta znode has been updated to null. + // We should try to assign the meta again. + shouldAssignMeta = true; + break; + } + // fall through + case NOT_HOSTING_REGION: + LOG.info("META has been assigned to otherwhere, skip assigning."); + break; + default: + throw new IOException("Unsupported action in MetaServerShutdownHandler"); + } + if (shouldAssignMeta) { // TODO: May block here if hard time figuring state of meta. 
- am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO); verifyAndAssignMetaWithRetries(env); if (this.shouldSplitWal && distributedLogReplay) { int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT); @@ -409,7 +432,8 @@ implements ServerProcedureInterface { for (int i = 1; i < replicaCount; i++) { HRegionInfo metaHri = RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i); - if (am.isCarryingMetaReplica(this.serverName, metaHri)) { + if (am.isCarryingMetaReplica(this.serverName, metaHri) == + AssignmentManager.ServerHostRegion.HOSTING_REGION) { if (LOG.isDebugEnabled()) { LOG.debug("Reassigning meta replica" + metaHri + " that was on " + this.serverName); } http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java index 510b017..cafb0ed 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.AssignmentManager; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; @@ -103,7 +104,8 @@ public class TestServerCrashProcedure { master.setServerCrashProcessingEnabled(false); // Kill a server. 
Master will notice but do nothing other than add it to list of dead servers. HRegionServer hrs = this.util.getHBaseCluster().getRegionServer(0); - boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(hrs.getServerName()); + boolean carryingMeta = (master.getAssignmentManager().isCarryingMeta(hrs.getServerName()) == + AssignmentManager.ServerHostRegion.HOSTING_REGION); this.util.getHBaseCluster().killRegionServer(hrs.getServerName()); hrs.join(); // Wait until the expiration of the server has arrived at the master. We won't process it