Repository: hbase
Updated Branches:
  refs/heads/branch-1.2 d46e9613b -> c368c8587


HBASE-15941 HBCK repair should not unsplit healthy splitted region

Signed-off-by: Michael Stack <st...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/c368c858
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/c368c858
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/c368c858

Branch: refs/heads/branch-1.2
Commit: c368c8587a85261ac9eefe56a2d5f30ee8fe3f33
Parents: d46e961
Author: Esteban Gutierrez <este...@apache.org>
Authored: Tue Mar 7 01:00:48 2017 -0800
Committer: Esteban Gutierrez <este...@apache.org>
Committed: Wed Mar 8 22:34:52 2017 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/util/HBaseFsck.java | 154 ++++++++++++++++++-
 .../hadoop/hbase/util/HBaseFsckRepair.java      |   8 +
 .../util/hbck/TableIntegrityErrorHandler.java   |   8 +
 .../apache/hadoop/hbase/util/TestHBaseFsck.java | 138 +++++++++++++++++
 4 files changed, 305 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/c368c858/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
index c1778f7..0fc4d15 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
@@ -245,6 +245,7 @@ public class HBaseFsck extends Configured implements 
Closeable {
   private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo)
   private boolean fixVersionFile = false; // fix missing hbase.version file in 
hdfs
   private boolean fixSplitParents = false; // fix lingering split parents
+  private boolean removeParents = false; // remove split parents
   private boolean fixReferenceFiles = false; // fix lingering reference store 
file
   private boolean fixEmptyMetaCells = false; // fix (remove) empty 
REGIONINFO_QUALIFIER rows
   private boolean fixTableLocks = false; // fix table locks which are expired
@@ -1044,6 +1045,8 @@ public class HBaseFsck extends Configured implements 
Closeable {
         setShouldRerun();
 
         success = fs.rename(path, dst);
+        debugLsr(dst);
+
       }
       if (!success) {
         LOG.error("Failed to sideline reference file " + path);
@@ -2287,7 +2290,8 @@ public class HBaseFsck extends Configured implements 
Closeable {
       }
       errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
           + descriptiveName + " is a split parent in META, in HDFS, "
-          + "and not deployed on any region server. This could be transient.");
+          + "and not deployed on any region server. This could be transient, "
+          + "consider to run the catalog janitor first!");
       if (shouldFixSplitParents()) {
         setShouldRerun();
         resetSplitParent(hbi);
@@ -2685,6 +2689,18 @@ public class HBaseFsck extends Configured implements 
Closeable {
       }
 
       @Override
+      public void handleSplit(HbckInfo r1, HbckInfo r2) throws IOException {
+        // Report the pair once per region, using the same message for both.
+        // NOTE(review): the value appended to the message is r1's start key,
+        // although the text labels it a regionID -- confirm the intended wording.
+        final String message = "Multiple regions have the same regionID: "
+            + Bytes.toStringBinary(r1.getStartKey());
+        errors.reportError(ERROR_CODE.DUPE_ENDKEYS, message, getTableInfo(), r1);
+        errors.reportError(ERROR_CODE.DUPE_ENDKEYS, message, getTableInfo(), r2);
+      }
+
+      @Override
       public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) 
throws IOException{
         errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
             "There is an overlap in the region chain.",
@@ -2818,10 +2834,124 @@ public class HBaseFsck extends Configured implements 
Closeable {
           }
           return;
         }
-
+        if (shouldRemoveParents()) {
+          removeParentsAndFixSplits(overlap);
+        }
         mergeOverlaps(overlap);
       }
 
+      /**
+       * Tries to resolve an overlap group that consists of a split parent and its
+       * two daughters by removing the parent and keeping the daughters.
+       *
+       * The group range is the smallest start key and largest end key among the
+       * members. A member whose HDFS region info spans exactly that range is taken
+       * as the parent; the daughters are the remaining members that start (or end)
+       * on the range boundary and carry a larger regionID than the parent. When a
+       * parent and two distinct daughters with equal regionIDs are found, the
+       * parent is closed, offlined, deleted from META and its region dir is
+       * sidelined, and all three regions are removed from {@code overlap}.
+       *
+       * In every other case (more than three members, no parent, missing or
+       * mismatched daughters, or a failing repair step) the method returns and
+       * leaves {@code overlap} untouched so the caller can fall back to the
+       * regular merge.
+       *
+       * @param overlap the overlap group to inspect; modified only on success
+       * @throws IOException if obtaining the file system or sidelining the parent fails
+       */
+      void removeParentsAndFixSplits(Collection<HbckInfo> overlap) throws IOException {
+        Pair<byte[], byte[]> range = null;
+        HbckInfo parent = null;
+        HbckInfo daughterA = null;
+        HbckInfo daughterB = null;
+        Collection<HbckInfo> daughters = new ArrayList<HbckInfo>(overlap);
+
+        String thread = Thread.currentThread().getName();
+        LOG.info("== [" + thread + "] Attempting fix splits in overlap state.");
+
+        // we only can handle a single split per group at the time
+        if (overlap.size() > 3) {
+          LOG.info("Too many overlaps were found on this group, falling back to regular merge.");
+          return;
+        }
+
+        // Compute the key range covered by the whole group.
+        for (HbckInfo hi : overlap) {
+          if (range == null) {
+            range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
+          } else {
+            if (RegionSplitCalculator.BYTES_COMPARATOR
+              .compare(hi.getStartKey(), range.getFirst()) < 0) {
+              range.setFirst(hi.getStartKey());
+            }
+            if (RegionSplitCalculator.BYTES_COMPARATOR
+              .compare(hi.getEndKey(), range.getSecond()) > 0) {
+              range.setSecond(hi.getEndKey());
+            }
+          }
+        }
+
+        // An empty group has no range and therefore no split to repair.
+        if (range == null) {
+          LOG.info("Overlap group is empty, nothing to fix.");
+          return;
+        }
+
+        LOG.info("This group range is [" + Bytes.toStringBinary(range.getFirst()) + ", "
+          + Bytes.toStringBinary(range.getSecond()) + "]");
+
+        // attempt to find a possible parent for the edge case of a split
+        for (HbckInfo hi : overlap) {
+          if (Bytes.compareTo(hi.getHdfsHRI().getStartKey(), range.getFirst()) == 0
+            && Bytes.compareTo(hi.getHdfsHRI().getEndKey(), range.getSecond()) == 0) {
+            LOG.info("This is a parent for this group: " + hi.toString());
+            parent = hi;
+          }
+        }
+
+        // Without a parent this is not a lingering split; the daughter checks
+        // below compare against the parent, so bail out before touching it.
+        if (parent == null) {
+          LOG.info("No parent region found in this group, falling back to regular merge.");
+          return;
+        }
+
+        // Remove parent regions from daughters collection
+        daughters.remove(parent);
+        long parentRegionId = parent.getHdfsHRI().getRegionId();
+
+        // Lets verify that daughters share the regionID at split time and they
+        // were created after the parent
+        for (HbckInfo hi : daughters) {
+          boolean createdAfterParent = parentRegionId < hi.getHdfsHRI().getRegionId();
+          if (createdAfterParent
+            && Bytes.compareTo(hi.getHdfsHRI().getStartKey(), range.getFirst()) == 0) {
+            daughterA = hi;
+          }
+          if (createdAfterParent
+            && Bytes.compareTo(hi.getHdfsHRI().getEndKey(), range.getSecond()) == 0) {
+            daughterB = hi;
+          }
+        }
+
+        // We need two distinct daughters that share the same regionID. A single
+        // region matching both boundaries is a duplicate of the parent range, not
+        // a split, and must go through the regular merge instead.
+        if (daughterA == null || daughterB == null || daughterA == daughterB
+          || daughterA.getHdfsHRI().getRegionId() != daughterB.getHdfsHRI().getRegionId()) {
+          LOG.info("No matching pair of daughters found, falling back to regular merge.");
+          return;
+        }
+
+        FileSystem fs = FileSystem.get(conf);
+        LOG.info("Found parent: " + parent.getRegionNameAsString());
+        LOG.info("Found potential daughter a: " + daughterA.getRegionNameAsString());
+        LOG.info("Found potential daughter b: " + daughterB.getRegionNameAsString());
+        LOG.info("Trying to fix parent in overlap by removing the parent.");
+        try {
+          closeRegion(parent);
+        } catch (IOException ioe) {
+          LOG.warn("Parent region could not be closed, continuing with regular merge...", ioe);
+          return;
+        } catch (InterruptedException ie) {
+          // Keep the interrupt visible to the caller instead of silently dropping it.
+          Thread.currentThread().interrupt();
+          LOG.warn("Parent region could not be closed, continuing with regular merge...", ie);
+          return;
+        }
+
+        try {
+          offline(parent.getRegionName());
+        } catch (IOException ioe) {
+          LOG.warn("Unable to offline parent region: " + parent.getRegionNameAsString()
+            + ".  Just continuing with regular merge... ", ioe);
+          return;
+        }
+
+        try {
+          HBaseFsckRepair.removeParentInMeta(conf, parent.getHdfsHRI());
+        } catch (IOException ioe) {
+          LOG.warn("Unable to remove parent region in META: " + parent.getRegionNameAsString()
+            + ".  Just continuing with regular merge... ", ioe);
+          return;
+        }
+
+        sidelineRegionDir(fs, parent);
+        LOG.info("[" + thread + "] Sidelined parent region dir "+ parent.getHdfsRegionDir() + " into " +
+          getSidelineDir());
+        debugLsr(parent.getHdfsRegionDir());
+
+        // Make sure we don't have the parents and daughters around
+        overlap.remove(parent);
+        overlap.remove(daughterA);
+        overlap.remove(daughterB);
+
+        LOG.info("Done fixing split.");
+
+      }
+
       void mergeOverlaps(Collection<HbckInfo> overlap)
           throws IOException {
         String thread = Thread.currentThread().getName();
@@ -3007,8 +3137,13 @@ public class HBaseFsck extends Configured implements 
Closeable {
             subRange.remove(r1);
             for (HbckInfo r2 : subRange) {
               if (r2.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) 
continue;
+              // general case of same start key
               if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey())==0) {
                 handler.handleDuplicateStartKeys(r1,r2);
+              } else if (Bytes.compareTo(r1.getEndKey(), r2.getStartKey())==0 
&&
+                r1.getHdfsHRI().getRegionId() == 
r2.getHdfsHRI().getRegionId()) {
+                LOG.info("this is a split, log to splits");
+                handler.handleSplit(r1, r2);
               } else {
                 // overlap
                 handler.handleOverlapInRegionChain(r1, r2);
@@ -3818,7 +3953,8 @@ public class HBaseFsck extends Configured implements 
Closeable {
       FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, 
DUPE_STARTKEYS,
       HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, 
DEGENERATE_REGION,
       ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, 
LINGERING_REFERENCE_HFILE,
-      WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, 
ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR
+      WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, 
ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR,
+      DUPE_ENDKEYS
     }
     void clear();
     void report(String message);
@@ -4344,10 +4480,19 @@ public class HBaseFsck extends Configured implements 
Closeable {
     fixAny |= shouldFix;
   }
 
+  /**
+   * Turns the "remove split parents" repair on or off. Enabling it also marks
+   * this run as one that performs fixes.
+   *
+   * @param shouldFix true to enable removal of split parents
+   */
+  public void setRemoveParents(boolean shouldFix) {
+    removeParents = shouldFix;
+    if (shouldFix) {
+      fixAny = true;
+    }
+  }
+
   boolean shouldFixSplitParents() {
     return fixSplitParents;
   }
 
+  /**
+   * @return true when removal of split parents was requested
+   */
+  boolean shouldRemoveParents() {
+    return this.removeParents;
+  }
+
   public void setFixReferenceFiles(boolean shouldFix) {
     fixReferenceFiles = shouldFix;
     fixAny |= shouldFix;
@@ -4472,6 +4617,7 @@ public class HBaseFsck extends Configured implements 
Closeable {
     out.println("   -sidelineBigOverlaps  When fixing region overlaps, allow 
to sideline big overlaps");
     out.println("   -maxOverlapsToSideline <n>  When fixing region overlaps, 
allow at most <n> regions to sideline per group. (n=" + 
DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
     out.println("   -fixSplitParents  Try to force offline split parents to be 
online.");
+    out.println("   -removeParents    Try to offline and sideline lingering 
parents and keep daughter regions.");
     out.println("   -ignorePreCheckPermission  ignore filesystem permission 
pre-check");
     out.println("   -fixReferenceFiles  Try to offline lingering reference 
store files");
     out.println("   -fixEmptyMetaCells  Try to fix hbase:meta entries not 
referencing any region"
@@ -4610,6 +4756,8 @@ public class HBaseFsck extends Configured implements 
Closeable {
         setSidelineBigOverlaps(true);
       } else if (cmd.equals("-fixSplitParents")) {
         setFixSplitParents(true);
+      } else if (cmd.equals("-removeParents")) {
+        setRemoveParents(true);
       } else if (cmd.equals("-ignorePreCheckPermission")) {
         setIgnorePreCheckPermission(true);
       } else if (cmd.equals("-checkCorruptHFiles")) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/c368c858/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
index 8347e5f..7a2b16d 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
@@ -201,4 +201,12 @@ public class HBaseFsckRepair {
     HRegion.closeHRegion(region);
     return region;
   }
+
+  /**
+   * Deletes the given (parent) region from META.
+   *
+   * A connection is created for this single operation and is always closed
+   * before returning, whether or not the delete succeeds.
+   *
+   * @param conf configuration used to create the connection
+   * @param hri region info of the region to delete from META
+   * @throws IOException if the connection cannot be created or the delete fails
+   */
+  public static void removeParentInMeta(Configuration conf, HRegionInfo hri) throws IOException {
+    Connection conn = ConnectionFactory.createConnection(conf);
+    try {
+      MetaTableAccessor.deleteRegion(conn, hri);
+    } finally {
+      conn.close();
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/c368c858/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/TableIntegrityErrorHandler.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/TableIntegrityErrorHandler.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/TableIntegrityErrorHandler.java
index 4310bf8..4ca0e74 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/TableIntegrityErrorHandler.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/TableIntegrityErrorHandler.java
@@ -74,6 +74,14 @@ public interface TableIntegrityErrorHandler {
   void handleDuplicateStartKeys(HbckInfo hi1, HbckInfo hi2) throws IOException;
 
   /**
+   * Callback for handling two regions that have the same regionID, which is
+   * treated as a specific case of a split.
+   * @param hi1 one of the two HbckInfo entries sharing the regionID
+   * @param hi2 the other HbckInfo entry sharing the regionID
+   */
+  void handleSplit(HbckInfo hi1, HbckInfo hi2) throws IOException;
+
+  /**
    * Callback for handling two reigons that overlap in some arbitrary way.
    * This is a specific case of region overlap, and called for each possible
    * pair. If two regions have the same start key, the handleDuplicateStartKeys

http://git-wip-us.apache.org/repos/asf/hbase/blob/c368c858/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
----------------------------------------------------------------------
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index 322bc74..bad841f 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -287,6 +287,144 @@ public class TestHBaseFsck {
   }
 
   /**
+   * Splits a region of a table with region replicas, then re-adds the split
+   * parent to META and assigns it again so that the parent is deployed next to
+   * its daughters. Verifies that hbck reports the resulting duplicate start
+   * keys and overlap, and that a run with fixHdfsOverlaps, removeParents and
+   * fixReferenceFiles leaves the table free of errors with all rows intact.
+   */
+  @Test (timeout=180000)
+  public void testSplitAndDupeRegionWithRegionReplica() throws Exception {
+    TableName table =
+      TableName.valueOf("testSplitAndDupeRegionWithRegionReplica");
+    Table meta = null;
+
+    try {
+      setupTableWithRegionReplica(table, 2);
+
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(ROWKEYS.length, countRows());
+
+      // No Catalog Janitor running
+      admin.enableCatalogJanitor(false);
+      meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
+      HRegionLocation loc = this.connection.getRegionLocation(table, SPLITS[0], false);
+      HRegionInfo hriParent = loc.getRegionInfo();
+
+      // Split Region A just before B. Use the shared admin rather than creating
+      // a new Admin that would never be closed.
+      admin.split(table, Bytes.toBytes("A@"));
+      Thread.sleep(1000);
+
+      // We need to make sure the parent region is not in a split state, so we put it in CLOSED state.
+      regionStates.updateRegionState(hriParent, RegionState.State.CLOSED);
+      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriParent);
+      MetaTableAccessor.addRegionToMeta(meta, hriParent);
+      ServerName server = regionStates.getRegionServerOfRegion(hriParent);
+
+      if (server != null) {
+        TEST_UTIL.assertRegionOnServer(hriParent, server, REGION_ONLINE_TIMEOUT);
+      }
+
+      // Bounded by the test timeout.
+      while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriParent) == null) {
+        Thread.sleep(250);
+      }
+
+      LOG.debug("Finished assignment of parent region");
+
+      // TODO why is dupe region different from dupe start keys?
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
+        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_DEPLOYED,
+        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
+        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
+        HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
+      assertEquals(3, hbck.getOverlapGroups(table).size());
+
+      // remove the lingering parent and keep the daughters.
+      hbck = new HBaseFsck(conf, hbfsckExecutorService);
+      hbck.setDisplayFullReport(); // i.e. -details
+      hbck.setTimeLag(0);
+      hbck.setFixHdfsOverlaps(true);
+      hbck.setRemoveParents(true);
+      hbck.setFixReferenceFiles(true);
+      hbck.connect();
+      hbck.onlineHbck();
+      hbck.close();
+
+      hbck = doFsck(conf, false);
+
+      assertNoErrors(hbck);
+      assertEquals(0, hbck.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+    } finally {
+      // Undo the changes to the shared cluster even if the test failed part-way:
+      // close the META table, turn the catalog janitor back on, drop the table.
+      try {
+        if (meta != null) {
+          meta.close();
+        }
+      } finally {
+        try {
+          admin.enableCatalogJanitor(true);
+        } finally {
+          cleanupTable(table);
+        }
+      }
+    }
+  }
+
+  /**
+   * Splits a region, then re-adds the split parent to META and assigns it again
+   * so that the parent is deployed next to its daughters. Verifies that hbck
+   * reports the resulting duplicate start keys and overlap, and that a run with
+   * fixHdfsOverlaps, removeParents and fixReferenceFiles leaves the table free
+   * of errors with all rows intact.
+   */
+  @Test (timeout=180000)
+  public void testSplitAndDupeRegion() throws Exception {
+    TableName table =
+      TableName.valueOf("testSplitAndDupeRegion");
+    Table meta = null;
+
+    try {
+      setupTable(table);
+
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(ROWKEYS.length, countRows());
+
+      // No Catalog Janitor running
+      admin.enableCatalogJanitor(false);
+      meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
+      HRegionLocation loc = this.connection.getRegionLocation(table, SPLITS[0], false);
+      HRegionInfo hriParent = loc.getRegionInfo();
+
+      // Split Region A just before B. Use the shared admin rather than creating
+      // a new Admin that would never be closed.
+      admin.split(table, Bytes.toBytes("A@"));
+      Thread.sleep(1000);
+
+      // We need to make sure the parent region is not in a split state, so we put it in CLOSED state.
+      regionStates.updateRegionState(hriParent, RegionState.State.CLOSED);
+      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriParent);
+
+      MetaTableAccessor.addRegionToMeta(meta, hriParent);
+      ServerName server = regionStates.getRegionServerOfRegion(hriParent);
+
+      if (server != null) {
+        TEST_UTIL.assertRegionOnServer(hriParent, server, REGION_ONLINE_TIMEOUT);
+      }
+
+      // Bounded by the test timeout.
+      while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriParent) == null) {
+        Thread.sleep(250);
+      }
+
+      LOG.debug("Finished assignment of parent region");
+
+      // TODO why is dupe region different from dupe start keys?
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
+        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
+        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
+        HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
+      assertEquals(3, hbck.getOverlapGroups(table).size());
+
+      // remove the lingering parent and keep the daughters.
+      hbck = new HBaseFsck(conf, hbfsckExecutorService);
+      hbck.setDisplayFullReport(); // i.e. -details
+      hbck.setTimeLag(0);
+      hbck.setFixHdfsOverlaps(true);
+      hbck.setRemoveParents(true);
+      hbck.setFixReferenceFiles(true);
+      hbck.connect();
+      hbck.onlineHbck();
+      hbck.close();
+
+      hbck = doFsck(conf, false);
+
+      assertNoErrors(hbck);
+      assertEquals(0, hbck.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+    } finally {
+      // Undo the changes to the shared cluster even if the test failed part-way:
+      // close the META table, turn the catalog janitor back on, drop the table.
+      try {
+        if (meta != null) {
+          meta.close();
+        }
+      } finally {
+        try {
+          admin.enableCatalogJanitor(true);
+        } finally {
+          cleanupTable(table);
+        }
+      }
+    }
+  }
+
+  /**
    * Create a new region in META.
    */
   private HRegionInfo createRegion(final HTableDescriptor

Reply via email to