Author: tedyu
Date: Tue Feb 14 17:54:23 2012
New Revision: 1244137
URL: http://svn.apache.org/viewvc?rev=1244137&view=rev
Log:
HBASE-5379 Backport HBASE-4287 to 0.90 - If region opening fails, try to
transition region back to
"offline" in ZK (Ram)
Modified:
hbase/branches/0.90/CHANGES.txt
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
Modified: hbase/branches/0.90/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/CHANGES.txt?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/CHANGES.txt (original)
+++ hbase/branches/0.90/CHANGES.txt Tue Feb 14 17:54:23 2012
@@ -45,6 +45,8 @@ Release 0.90.6 - Unreleased
HBASE-5321 this.allRegionServersOffline not set to false after one RS
comes online and
assignment is done in 0.90. (Ram)
HBASE-5377 Fix licenses on the 0.90 branch.
+ HBASE-5379 Backport HBASE-4287 to 0.90 - If region opening fails, try to
transition region back to
+ "offline" in ZK
IMPROVEMENT
HBASE-5102 Change the default value of the property
"hbase.connection.per.config" to false in
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
---
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
(original)
+++
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
Tue Feb 14 17:54:23 2012
@@ -107,6 +107,7 @@ public abstract class EventHandler imple
RS_ZK_REGION_CLOSED (2), // RS has finished closing a region
RS_ZK_REGION_OPENING (3), // RS is in process of opening a region
RS_ZK_REGION_OPENED (4), // RS has finished opening a region
+ RS_ZK_REGION_FAILED_OPEN (5), // RS failed to open a region
// Messages originating from Master to RS
M_RS_OPEN_REGION (20), // Master asking RS to open a region
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
---
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
(original)
+++
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
Tue Feb 14 17:54:23 2012
@@ -107,6 +107,7 @@ public class ExecutorService {
// Master executor services
case RS_ZK_REGION_CLOSED:
+ case RS_ZK_REGION_FAILED_OPEN:
return ExecutorType.MASTER_CLOSE_REGION;
case RS_ZK_REGION_OPENED:
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
---
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Tue Feb 14 17:54:23 2012
@@ -24,6 +24,7 @@ import java.io.DataOutput;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -138,6 +139,10 @@ public class AssignmentManager extends Z
new TreeMap<HRegionInfo,HServerInfo>();
private final ExecutorService executorService;
+
+ private List<EventType> ignoreStatesRSOffline = Arrays
+ .asList(new EventType[] { EventType.RS_ZK_REGION_FAILED_OPEN,
+ EventType.RS_ZK_REGION_CLOSED });
/**
* Constructs a new assignment manager.
@@ -346,6 +351,7 @@ public class AssignmentManager extends Z
break;
case RS_ZK_REGION_CLOSED:
+ case RS_ZK_REGION_FAILED_OPEN:
// Region is closed, insert into RIT and handle it
addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data);
break;
@@ -473,7 +479,8 @@ public class AssignmentManager extends Z
}
// Verify this is a known server
if (!serverManager.isServerOnline(data.getServerName()) &&
- !this.master.getServerName().equals(data.getServerName())) {
+ !this.master.getServerName().equals(data.getServerName())
+ && !ignoreStatesRSOffline.contains(data.getEventType())) {
LOG.warn("Attempted to handle region transition for server but " +
"server is not online: " + Bytes.toString(data.getRegionName()));
return;
@@ -541,6 +548,20 @@ public class AssignmentManager extends Z
// Transition to OPENING (or update stamp if already OPENING)
regionState.update(RegionState.State.OPENING, data.getStamp());
break;
+ case RS_ZK_REGION_FAILED_OPEN:
+ if (regionState == null
+ || (!regionState.isPendingOpen() && !regionState.isOpening())) {
+ LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName
+ + " from server " + data.getServerName() + " but region was in "
+ + " the state " + regionState
+ + " and not in PENDING_OPEN or OPENING");
+ return;
+ }
+ // Handle this the same as if it were opened and then closed.
+ regionState.update(RegionState.State.CLOSED, data.getStamp());
+ this.executorService.submit(new ClosedRegionHandler(master, this,
+ regionState.getRegion()));
+ break;
case RS_ZK_REGION_OPENED:
// Should see OPENED after OPENING but possible after PENDING_OPEN
@@ -1996,8 +2017,8 @@ public class AssignmentManager extends Z
LOG.debug("Region has transitioned to OPENED, allowing " +
"watched event handlers to process");
break;
- } else if (data.getEventType() !=
- EventType.RS_ZK_REGION_OPENING) {
+ } else if (data.getEventType() != EventType.RS_ZK_REGION_OPENING
+ && data.getEventType() != EventType.RS_ZK_REGION_FAILED_OPEN) {
LOG.warn("While timing out a region in state OPENING, " +
"found ZK node in unexpected state: " +
data.getEventType());
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
---
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
(original)
+++
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
Tue Feb 14 17:54:23 2012
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.executor.EventHandler;
+import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.util.CancelableProgressable;
@@ -95,13 +96,17 @@ public class OpenRegionHandler extends E
if (!transitionZookeeperOfflineToOpening(encodedName)) {
LOG.warn("Region was hijacked? It no longer exists, encodedName=" +
encodedName);
+ tryTransitionToFailedOpen(regionInfo);
return;
}
// Open region. After a successful open, failures in subsequent
// processing needs to do a close as part of cleanup.
region = openRegion();
- if (region == null) return;
+ if (region == null) {
+ tryTransitionToFailedOpen(regionInfo);
+ return;
+ }
boolean failed = true;
if (tickleOpening("post_region_open")) {
if (updateMeta(region)) failed = false;
@@ -110,10 +115,17 @@ public class OpenRegionHandler extends E
if (failed || this.server.isStopped() ||
this.rsServices.isStopping()) {
cleanupFailedOpen(region);
+ tryTransitionToFailedOpen(regionInfo);
return;
}
if (!transitionToOpened(region)) {
+ // If we fail to transition to opened, it's because of one of two
cases:
+ // (a) we lost our ZK lease
+ // OR (b) someone else opened the region before us
+ // In either case, we don't need to transition to FAILED_OPEN state.
+ // In case (a), the Master will process us as a dead server. In case
+ // (b) the region is already being handled elsewhere anyway.
cleanupFailedOpen(region);
return;
}
@@ -125,6 +137,35 @@ public class OpenRegionHandler extends E
remove(this.regionInfo.getEncodedNameAsBytes());
}
}
+ /**
+ * @param Region we're working on.
+ * This is not guaranteed to succeed, we just do our best.
+ * @return whether znode is successfully transitioned to FAILED_OPEN state.
+ */
+ private boolean tryTransitionToFailedOpen(final HRegionInfo hri) {
+ boolean result = false;
+ final String name = hri.getRegionNameAsString();
+ try {
+ LOG.info("Opening of region " + hri
+ + " failed, marking as FAILED_OPEN in ZK");
+ if (ZKAssign.transitionNode(
+ this.server.getZooKeeper(), hri,
+ this.server.getServerName(),
+ EventType.RS_ZK_REGION_OPENING,
+ EventType.RS_ZK_REGION_FAILED_OPEN,
+ this.version) == -1) {
+ LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
+ "It's likely that the master already timed out this open " +
+ "attempt, and thus another RS already has the region.");
+ } else {
+ result = true;
+ }
+ } catch (KeeperException e) {
+ LOG.error("Failed transitioning node " + name +
+ " from OPENING to FAILED_OPEN", e);
+ }
+ return result;
+ }
/**
* Update ZK, ROOT or META. This can take a while if for example the
@@ -133,7 +174,7 @@ public class OpenRegionHandler extends E
* state meantime so master doesn't timeout our region-in-transition.
* Caller must cleanup region if this fails.
*/
- private boolean updateMeta(final HRegion r) {
+ boolean updateMeta(final HRegion r) {
if (this.server.isStopped() || this.rsServices.isStopping()) {
return false;
}
@@ -285,11 +326,12 @@ public class OpenRegionHandler extends E
return tickleOpening("open_region_progress");
}
});
- } catch (IOException e) {
- // We failed open. Let our znode expire in regions-in-transition and
- // Master will assign elsewhere. Presumes nothing to close.
+ } catch (Throwable t) {
+ // We failed open. Our caller will see the 'null' return value
+ // and transition the node back to FAILED_OPEN. If that fails,
+ // we rely on the Timeout Monitor in the master to reassign.
LOG.error("Failed open of region=" +
- this.regionInfo.getRegionNameAsString(), e);
+ this.regionInfo.getRegionNameAsString(), t);
}
return region;
}
Modified:
hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
URL:
http://svn.apache.org/viewvc/hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
---
hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
(original)
+++
hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
Tue Feb 14 17:54:23 2012
@@ -19,6 +19,8 @@
*/
package org.apache.hadoop.hbase.regionserver.handler;
+import static org.junit.Assert.assertEquals;
+
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseTestingUtility;
@@ -26,8 +28,11 @@ import org.apache.hadoop.hbase.HConstant
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MockRegionServerServices;
import org.apache.hadoop.hbase.util.MockServer;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
@@ -36,6 +41,7 @@ import org.apache.hadoop.hbase.zookeeper
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.AfterClass;
+import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -44,14 +50,31 @@ import org.junit.Test;
*/
public class TestOpenRegionHandler {
private final static HBaseTestingUtility HTU = new HBaseTestingUtility();
+ private static HTableDescriptor TEST_HTD;
+ private HRegionInfo TEST_HRI;
+
+ private int testIndex = 0;
@BeforeClass public static void before() throws Exception {
HTU.startMiniZKCluster();
+ TEST_HTD = new HTableDescriptor("TestOpenRegionHandler.java");
}
@AfterClass public static void after() throws IOException {
+ TEST_HTD = null;
HTU.shutdownMiniZKCluster();
}
+ /**
+ * Before each test, use a different HRI, so the different tests
+ * don't interfere with each other. This allows us to use just
+ * a single ZK cluster for the whole suite.
+ */
+ @Before
+ public void setupHRI() {
+ TEST_HRI = new HRegionInfo(TEST_HTD, Bytes.toBytes(testIndex), Bytes
+ .toBytes(testIndex + 1));
+ testIndex++;
+ }
/**
* Test the openregionhandler can deal with its znode being yanked out from
@@ -94,4 +117,55 @@ public class TestOpenRegionHandler {
// post OPENING; again will expect it to come back w/o NPE or exception.
handler.process();
}
+ @Test
+ public void testFailedOpenRegion() throws Exception {
+ Server server = new MockServer();
+ RegionServerServices rsServices = new MockRegionServerServices();
+
+ // Create it OFFLINE, which is what it expects
+ ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server
+ .getServerName());
+
+ // Create the handler
+ OpenRegionHandler handler =
+ new OpenRegionHandler(server, rsServices, TEST_HRI) {
+ @Override
+ HRegion openRegion() {
+ // Fake failure of opening a region due to an IOE, which is caught
+ return null;
+ }
+ };
+ handler.process();
+
+ // Handler should have transitioned it to FAILED_OPEN
+ RegionTransitionData data =
+ ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
+ assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
+ }
+
+ @Test
+ public void testFailedUpdateMeta() throws Exception {
+ Server server = new MockServer();
+ RegionServerServices rsServices = new MockRegionServerServices();
+
+ // Create it OFFLINE, which is what it expects
+ ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server
+ .getServerName());
+
+ // Create the handler
+ OpenRegionHandler handler =
+ new OpenRegionHandler(server, rsServices, TEST_HRI) {
+ @Override
+ boolean updateMeta(final HRegion r) {
+ // Fake failure of updating META
+ return false;
+ }
+ };
+ handler.process();
+
+ // Handler should have transitioned it to FAILED_OPEN
+ RegionTransitionData data =
+ ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
+ assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
+ }
}