HBASE-18625 Splitting of a region with replicas doesn't update the region list in serverHoldings. A server crash then leads to overlap.
Signed-off-by: ramkrishna.s.vasudevan<ramkrishna.s.vasude...@intel.com> (cherry picked from commit 80d183a707c6d0203da1565f6a7ca3edf4bb2291) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/4b795553 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/4b795553 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/4b795553 Branch: refs/heads/branch-1.2 Commit: 4b795553ea5a05d16088db08c9cc4994e354d56c Parents: 688d4de Author: huaxiang sun <huaxiang...@apache.org> Authored: Wed Jan 3 12:24:05 2018 -0800 Committer: Sean Busbey <bus...@apache.org> Committed: Mon Feb 26 10:46:54 2018 -0600 ---------------------------------------------------------------------- .../hadoop/hbase/master/RegionStates.java | 33 ++++++- .../TestCatalogJanitorInMemoryStates.java | 91 +++++++++++++++++--- 2 files changed, 112 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/4b795553/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index cf68a02..3b89f7b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -545,6 +545,19 @@ public class RegionStates { } /** + * Used in some unit tests + */ + @VisibleForTesting + synchronized boolean existsInServerHoldings(final ServerName serverName, + final HRegionInfo hri) { + Set<HRegionInfo> oldRegions = serverHoldings.get(serverName); + if (oldRegions != null) { + return oldRegions.contains(hri); + } + return false; + } + + /** * A dead server's wals have been split so that all the regions * used to be 
open on it can be safely assigned now. Mark them assignable. */ @@ -613,8 +626,26 @@ public class RegionStates { deleteRegion(hri); return; } + + /* + * One tricky case, if region here is a replica region and its parent is at + * SPLIT state, its newState should be same as its parent, not OFFLINE. + */ State newState = - expectedState == null ? State.OFFLINE : expectedState; + expectedState == null ? State.OFFLINE : expectedState; + + if ((expectedState == null) && !RegionReplicaUtil.isDefaultReplica(hri)) { + RegionState primateState = getRegionState( + RegionReplicaUtil.getRegionInfoForDefaultReplica(hri)); + if ((primateState != null) && (primateState.getState() == State.SPLIT)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Update region " + hri + "to SPLIT, from primary region " + + RegionReplicaUtil.getRegionInfoForDefaultReplica(hri)); + } + newState = State.SPLIT; + } + } + updateRegionState(hri, newState); String encodedName = hri.getEncodedName(); synchronized (this) { http://git-wip-us.apache.org/repos/asf/hbase/blob/4b795553/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitorInMemoryStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitorInMemoryStates.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitorInMemoryStates.java index d2bed9b..6ea7e2f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitorInMemoryStates.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitorInMemoryStates.java @@ -18,21 +18,20 @@ */ package org.apache.hadoop.hbase.master; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.*; import 
org.apache.hadoop.hbase.client.*; -import org.apache.hadoop.hbase.master.AssignmentManager; -import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; -import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.PairOfSameType; import org.apache.hadoop.hbase.util.Threads; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.junit.AfterClass; @@ -44,12 +43,6 @@ import org.junit.experimental.categories.Category; import org.junit.rules.TestName; import org.junit.rules.TestRule; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertNotNull; @Category({MasterTests.class, MediumTests.class}) @@ -126,6 +119,57 @@ public class TestCatalogJanitorInMemoryStates { } + /** + * Test that after replica parent region is split, the parent replica region is removed from + * AM's serverHoldings and + */ + @Test(timeout = 180000) + public void testInMemoryForReplicaParentCleanup() throws IOException, InterruptedException { + final AssignmentManager am = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager(); + final CatalogJanitor janitor = TEST_UTIL.getHBaseCluster().getMaster().catalogJanitorChore; + + final TableName tableName = TableName.valueOf("testInMemoryForReplicaParentCleanup"); + HTableDescriptor hdt = TEST_UTIL.createTableDescriptor(tableName.getNameAsString()); + hdt.setRegionReplication(2); + TEST_UTIL.createTable(hdt, new byte[][] { FAMILY }, TEST_UTIL.getConfiguration()); + + RegionLocator locator = TEST_UTIL.getConnection().getRegionLocator(tableName); + 
List<HRegionLocation> allRegionLocations = locator.getAllRegionLocations(); + + // There are two regions, one for primary, one for the replica. + assertTrue(allRegionLocations.size() == 2); + + HRegionLocation replicaParentRegion, primaryParentRegion; + if (RegionReplicaUtil.isDefaultReplica( + allRegionLocations.get(0).getRegionInfo().getReplicaId())) { + primaryParentRegion = allRegionLocations.get(0); + replicaParentRegion = allRegionLocations.get(1); + } else { + primaryParentRegion = allRegionLocations.get(1); + replicaParentRegion = allRegionLocations.get(0); + } + + List<HRegionLocation> primaryDaughters = splitRegion(primaryParentRegion.getRegionInfo(), + Bytes.toBytes("a")); + + // Wait until the replica parent region is offline. + while (am.getRegionStates().isRegionOnline(replicaParentRegion.getRegionInfo())) { + Thread.sleep(100); + } + + assertNotNull("Should have found daughter regions for " + primaryDaughters, primaryDaughters); + + // check that primary parent region is not in AM's serverHoldings + assertFalse("Primary Parent region should have been removed from RegionState's serverHoldings", + am.getRegionStates().existsInServerHoldings(primaryParentRegion.getServerName(), + primaryParentRegion.getRegionInfo())); + + // check that primary parent region is not in AM's serverHoldings + assertFalse("Primary Parent region should have been removed from RegionState's serverHoldings", + am.getRegionStates().existsInServerHoldings(replicaParentRegion.getServerName(), + replicaParentRegion.getRegionInfo())); + } + /* * Splits a region * @param t Region to split. @@ -152,6 +196,31 @@ public class TestCatalogJanitorInMemoryStates { } /* +* Splits a region +* @param t Region to split. 
+* @return List of region locations +* @throws IOException, InterruptedException +*/ + private List<HRegionLocation> splitRegion(final HRegionInfo r, final byte[] splitPoint) + throws IOException, InterruptedException { + List<HRegionLocation> locations = new ArrayList<>(); + // Split this table in two. + Admin admin = TEST_UTIL.getHBaseAdmin(); + Connection connection = TEST_UTIL.getConnection(); + admin.splitRegion(r.getEncodedNameAsBytes(), splitPoint); + admin.close(); + PairOfSameType<HRegionInfo> regions = waitOnDaughters(r); + if (regions != null) { + try (RegionLocator rl = connection.getRegionLocator(r.getTable())) { + locations.add(rl.getRegionLocation(regions.getFirst().getEncodedNameAsBytes())); + locations.add(rl.getRegionLocation(regions.getSecond().getEncodedNameAsBytes())); + } + return locations; + } + return locations; + } + + /* * Wait on region split. May return because we waited long enough on the split * and it didn't happen. Caller should check. * @param r