HBASE-19974 Fix decommissioned servers cannot be removed by remove_servers_rsgroup methods
Signed-off-by: tedyu <yuzhih...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/a29b3caf Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/a29b3caf Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/a29b3caf Branch: refs/heads/HBASE-15151 Commit: a29b3caf4dbc7b8833474ef5da5438f7f6907e00 Parents: 2beda62 Author: haxiaolin <haxiao...@xiaomi.com> Authored: Mon Feb 26 14:25:01 2018 +0800 Committer: tedyu <yuzhih...@gmail.com> Committed: Mon Feb 26 07:30:56 2018 -0800 ---------------------------------------------------------------------- .../hbase/rsgroup/RSGroupAdminServer.java | 8 +++- .../hadoop/hbase/rsgroup/TestRSGroups.java | 7 ++- .../hadoop/hbase/rsgroup/TestRSGroupsBase.java | 46 ++++++++++++-------- 3 files changed, 40 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/a29b3caf/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java ---------------------------------------------------------------------- diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index aba57fe..094fc1d 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -675,8 +675,12 @@ public class RSGroupAdminServer implements RSGroupAdmin { private void checkForDeadOrOnlineServers(Set<Address> servers) throws ConstraintException { // This uglyness is because we only have Address, not ServerName. Set<Address> onlineServers = new HashSet<>(); - for(ServerName server: master.getServerManager().getOnlineServers().keySet()) { - onlineServers.add(server.getAddress()); + List<ServerName> drainingServers = master.getServerManager().getDrainingServersList(); + for (ServerName server : master.getServerManager().getOnlineServers().keySet()) { + // Only online but not decommissioned servers are really online + if (!drainingServers.contains(server)) { + onlineServers.add(server.getAddress()); + } } Set<Address> deadServers = new HashSet<>(); http://git-wip-us.apache.org/repos/asf/hbase/blob/a29b3caf/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroups.java ---------------------------------------------------------------------- diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroups.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroups.java index 9116f3b..610278a 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroups.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroups.java @@ -36,7 +36,6 @@ import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.Waiter.Predicate; import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; -import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.snapshot.SnapshotManager; import org.apache.hadoop.hbase.net.Address; @@ -66,7 +65,6 @@ public class TestRSGroups extends TestRSGroupsBase { HBaseClassTestRule.forClass(TestRSGroups.class); protected static final Logger LOG = LoggerFactory.getLogger(TestRSGroups.class); - private static HMaster master; private static boolean INIT = false; private static RSGroupAdminEndpoint rsGroupAdminEndpoint; @@ -126,6 +124,11 @@ public class TestRSGroups extends TestRSGroupsBase { deleteNamespaceIfNecessary(); deleteGroups(); + for(ServerName sn : admin.listDecommissionedRegionServers()){ + admin.recommissionRegionServer(sn, null); + } + assertTrue(admin.listDecommissionedRegionServers().isEmpty()); + int missing = NUM_SLAVES_BASE - getNumServers(); LOG.info("Restoring servers: "+missing); for(int i=0; i<missing; i++) { http://git-wip-us.apache.org/repos/asf/hbase/blob/a29b3caf/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java ---------------------------------------------------------------------- diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java index c3f7eef..76bcd20 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java @@ -25,8 +25,10 @@ import static org.junit.Assert.fail; import java.io.IOException; import java.security.SecureRandom; +import java.util.ArrayList; import java.util.EnumSet; import java.util.HashSet; +import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -81,9 +83,11 @@ public abstract class TestRSGroupsBase { protected static Admin admin; protected static HBaseCluster cluster; protected static RSGroupAdmin rsGroupAdmin; + protected static HMaster master; public final static long WAIT_TIMEOUT = 60000*5; public final static int NUM_SLAVES_BASE = 4; //number of slaves for the smallest cluster + public static int NUM_DEAD_SERVERS = 0; // Per test variables TableName tableName; @@ -271,10 +275,10 @@ public abstract class TestRSGroupsBase { public int getNumServers() throws IOException { ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.MASTER, Option.LIVE_SERVERS)); - ServerName master = status.getMasterName(); + ServerName masterName = status.getMasterName(); int count = 0; for (ServerName sn : status.getLiveServerMetrics().keySet()) { - if (!sn.equals(master)) { + if (!sn.equals(masterName)) { count++; } } @@ -885,6 +889,7 @@ public abstract class TestRSGroupsBase { public void testClearDeadServers() throws Exception { LOG.info("testClearDeadServers"); final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3); + NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size(); ServerName targetServer = ServerName.parseServerName( newGroup.getServers().iterator().next().toString()); @@ -897,15 +902,15 @@ public abstract class TestRSGroupsBase { //due to the connection loss targetRS.stopServer(null, AdminProtos.StopServerRequest.newBuilder().setReason("Die").build()); + NUM_DEAD_SERVERS ++; } catch(Exception e) { } - HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); //wait for stopped regionserver to dead server list TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { @Override public boolean evaluate() throws Exception { return !master.getServerManager().areDeadServersInProgress() - && cluster.getClusterMetrics().getDeadServerNames().size() > 0; + && cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS; } }); assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer)); @@ -925,8 +930,10 @@ public abstract class TestRSGroupsBase { public void testRemoveServers() throws Exception { LOG.info("testRemoveServers"); final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3); - ServerName targetServer = ServerName.parseServerName( - newGroup.getServers().iterator().next().toString()); + Iterator<Address> iterator = newGroup.getServers().iterator(); + ServerName targetServer = ServerName.parseServerName(iterator.next().toString()); + + // remove online servers try { rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress())); fail("Online servers shouldn't have been successfully removed."); @@ -938,6 +945,8 @@ public abstract class TestRSGroupsBase { } assertTrue(newGroup.getServers().contains(targetServer.getAddress())); + // remove dead servers + NUM_DEAD_SERVERS = cluster.getClusterMetrics().getDeadServerNames().size(); AdminProtos.AdminService.BlockingInterface targetRS = ((ClusterConnection) admin.getConnection()).getAdmin(targetServer); try { @@ -945,18 +954,19 @@ public abstract class TestRSGroupsBase { GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName()); //stopping may cause an exception //due to the connection loss + LOG.info("stopping server " + targetServer.getHostAndPort()); targetRS.stopServer(null, AdminProtos.StopServerRequest.newBuilder().setReason("Die").build()); + NUM_DEAD_SERVERS ++; } catch(Exception e) { } - HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); //wait for stopped regionserver to dead server list TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { @Override public boolean evaluate() throws Exception { return !master.getServerManager().areDeadServersInProgress() - && cluster.getClusterMetrics().getDeadServerNames().size() > 0; + && cluster.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS; } }); @@ -971,17 +981,19 @@ public abstract class TestRSGroupsBase { } assertTrue(newGroup.getServers().contains(targetServer.getAddress())); - ServerName sn = TEST_UTIL.getHBaseClusterInterface().getClusterMetrics().getMasterName(); - TEST_UTIL.getHBaseClusterInterface().stopMaster(sn); - TEST_UTIL.getHBaseClusterInterface().waitForMasterToStop(sn, 60000); - TEST_UTIL.getHBaseClusterInterface().startMaster(sn.getHostname(), 0); - TEST_UTIL.getHBaseClusterInterface().waitForActiveAndReadyMaster(60000); + // remove decommissioned servers + List<ServerName> serversToDecommission = new ArrayList<>(); + targetServer = ServerName.parseServerName(iterator.next().toString()); + targetRS = ((ClusterConnection) admin.getConnection()).getAdmin(targetServer); + targetServer = ProtobufUtil.toServerName(targetRS.getServerInfo(null, + GetServerInfoRequest.newBuilder().build()).getServerInfo().getServerName()); + assertTrue(master.getServerManager().getOnlineServers().containsKey(targetServer)); + serversToDecommission.add(targetServer); - assertEquals(3, cluster.getClusterMetrics().getLiveServerMetrics().size()); - assertFalse(cluster.getClusterMetrics().getLiveServerMetrics().containsKey(targetServer)); - assertFalse(cluster.getClusterMetrics().getDeadServerNames().contains(targetServer)); - assertTrue(newGroup.getServers().contains(targetServer.getAddress())); + admin.decommissionRegionServers(serversToDecommission, true); + assertEquals(1, admin.listDecommissionedRegionServers().size()); + assertTrue(newGroup.getServers().contains(targetServer.getAddress())); rsGroupAdmin.removeServers(Sets.newHashSet(targetServer.getAddress())); Set<Address> newGroupServers = rsGroupAdmin.getRSGroupInfo(newGroup.getName()).getServers(); assertFalse(newGroupServers.contains(targetServer.getAddress()));