Author: todd Date: Wed Apr 4 19:21:01 2012 New Revision: 1309554 URL: http://svn.apache.org/viewvc?rev=1309554&view=rev Log: HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.
Added: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt Wed Apr 4 19:21:01 2012 @@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn' HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd) HADOOP-8215. Security support for ZK Failover controller (todd) + +HADOOP-8245. Fix flakiness in TestZKFailoverController (todd) Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java Wed Apr 4 19:21:01 2012 @@ -240,8 +240,6 @@ public class ActiveStandbyElector implem public synchronized void joinElection(byte[] data) throws HadoopIllegalArgumentException { - LOG.debug("Attempting active election"); - if (data == null) { throw new HadoopIllegalArgumentException("data cannot be null"); } @@ -249,6 +247,7 @@ public class ActiveStandbyElector implem appData = new byte[data.length]; System.arraycopy(data, 0, appData, 0, data.length); + LOG.debug("Attempting active election for " + this); joinElectionInternal(); } @@ -272,6 +271,9 @@ public class ActiveStandbyElector implem */ public synchronized void ensureParentZNode() throws IOException, InterruptedException { + Preconditions.checkState(!wantToBeInElection, + "ensureParentZNode() may not be called while in the election"); + String pathParts[] = znodeWorkingDir.split("/"); Preconditions.checkArgument(pathParts.length >= 1 && "".equals(pathParts[0]), @@ -305,6 +307,9 @@ public class ActiveStandbyElector implem */ public synchronized void clearParentZNode() throws IOException, InterruptedException { + Preconditions.checkState(!wantToBeInElection, + "clearParentZNode() may not be called while in the election"); + try { LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); @@ -393,7 +398,8 @@ public class ActiveStandbyElector implem String name) { if (isStaleClient(ctx)) return; LOG.debug("CreateNode result: " + rc + " for path: " + path - + " connectionState: " + zkConnectionState); + + " connectionState: " + zkConnectionState + + " for " + this); Code code = Code.get(rc); if (isSuccess(code)) { @@ -449,8 +455,13 @@ public class ActiveStandbyElector implem public synchronized void processResult(int rc, String path, Object ctx, Stat stat) { if (isStaleClient(ctx)) return; + + assert wantToBeInElection : + "Got a StatNode result after quitting election"; + LOG.debug("StatNode result: " + rc + " for path: " + path - + " connectionState: " + zkConnectionState); + + " connectionState: " + zkConnectionState + " for " + this); + Code code = Code.get(rc); if (isSuccess(code)) { @@ -517,7 +528,8 @@ public class ActiveStandbyElector implem if (isStaleClient(zk)) return; LOG.debug("Watcher event type: " + eventType + " with state:" + event.getState() + " for path:" + event.getPath() - + " connectionState: " + zkConnectionState); + + " connectionState: " + zkConnectionState + + " for " + this); if (eventType == Event.EventType.None) { // the connection state has changed @@ -528,7 +540,8 @@ public class ActiveStandbyElector implem // be undone ConnectionState prevConnectionState = zkConnectionState; zkConnectionState = ConnectionState.CONNECTED; - if (prevConnectionState == ConnectionState.DISCONNECTED) { + if (prevConnectionState == ConnectionState.DISCONNECTED && + wantToBeInElection) { monitorActiveStatus(); } break; @@ -600,12 +613,14 @@ public class ActiveStandbyElector implem } private void fatalError(String errorMessage) { + LOG.fatal(errorMessage); reset(); appClient.notifyFatalError(errorMessage); } private void monitorActiveStatus() { - LOG.debug("Monitoring active leader"); + assert wantToBeInElection; + LOG.debug("Monitoring active leader for " + this); statRetryCount = 0; monitorLockNodeAsync(); } @@ -688,7 +703,7 @@ public class ActiveStandbyElector implem int connectionRetryCount = 0; boolean success = false; while(!success && connectionRetryCount < NUM_RETRIES) { - LOG.debug("Establishing zookeeper connection"); + LOG.debug("Establishing zookeeper connection for " + this); try { createConnection(); success = true; @@ -703,13 +718,14 @@ public class ActiveStandbyElector implem private void createConnection() throws IOException { zkClient = getNewZooKeeper(); + LOG.debug("Created new connection for " + this); } private void terminateConnection() { if (zkClient == null) { return; } - LOG.debug("Terminating ZK connection"); + LOG.debug("Terminating ZK connection for " + this); ZooKeeper tempZk = zkClient; zkClient = null; try { @@ -735,7 +751,7 @@ public class ActiveStandbyElector implem Stat oldBreadcrumbStat = fenceOldActive(); writeBreadCrumbNode(oldBreadcrumbStat); - LOG.debug("Becoming active"); + LOG.debug("Becoming active for " + this); appClient.becomeActive(); state = State.ACTIVE; return true; @@ -838,7 +854,7 @@ public class ActiveStandbyElector implem private void becomeStandby() { if (state != State.STANDBY) { - LOG.debug("Becoming standby"); + LOG.debug("Becoming standby for " + this); state = State.STANDBY; appClient.becomeStandby(); } @@ -846,7 +862,7 @@ public class ActiveStandbyElector implem private void enterNeutralMode() { if (state != State.NEUTRAL) { - LOG.debug("Entering neutral mode"); + LOG.debug("Entering neutral mode for " + this); state = State.NEUTRAL; appClient.enterNeutralMode(); } @@ -943,8 +959,14 @@ public class ActiveStandbyElector implem @Override public void process(WatchedEvent event) { - ActiveStandbyElector.this.processWatchEvent( - zk, event); + try { + ActiveStandbyElector.this.processWatchEvent( + zk, event); + } catch (Throwable t) { + fatalError( + "Failed to process watcher event " + event + ": " + + StringUtils.stringifyException(t)); + } } } @@ -972,5 +994,13 @@ public class ActiveStandbyElector implem } return false; } + + @Override + public String toString() { + return "elector id=" + System.identityHashCode(this) + + " appData=" + + ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + + " cb=" + appClient; + } } Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java Wed Apr 4 19:21:01 2012 @@ -154,6 +154,7 @@ public abstract class ZKFailoverControll try { mainLoop(); } finally { + elector.quitElection(true); healthMonitor.shutdown(); healthMonitor.join(); } @@ -379,6 +380,11 @@ public abstract class ZKFailoverControll throw new RuntimeException("Unable to fence " + target); } } + + @Override + public String toString() { + return "Elector callbacks for " + localTarget; + } } /** Added: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java?rev=1309554&view=auto ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java (added) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java Wed Apr 4 19:21:01 2012 @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.File; +import java.util.Set; + +import javax.management.ObjectName; + +import org.apache.zookeeper.test.ClientBase; +import org.apache.zookeeper.test.JMXEnv; + +/** + * A subclass of ZK's ClientBase testing utility, with some fixes + * necessary for running in the Hadoop context. + */ +public class ClientBaseWithFixes extends ClientBase { + + /** + * When running on the Jenkins setup, we need to ensure that this + * build directory exists before running the tests. + */ + @Override + public void setUp() throws Exception { + // build.test.dir is used by zookeeper + new File(System.getProperty("build.test.dir", "build")).mkdirs(); + super.setUp(); + } + + /** + * ZK seems to have a bug when we muck with its sessions + * behind its back, causing disconnects, etc. This bug + * ends up leaving JMX beans around at the end of the test, + * and ClientBase's teardown method will throw an exception + * if it finds JMX beans leaked. So, clear them out there + * to workaround the ZK bug. See ZOOKEEPER-1438. + */ + @Override + public void tearDown() throws Exception { + Set<ObjectName> names = JMXEnv.ensureAll(); + for (ObjectName n : names) { + try { + JMXEnv.conn().unregisterMBean(n); + } catch (Throwable t) { + // ignore + } + } + } +} Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java Wed Apr 4 19:21:01 2012 @@ -389,6 +389,7 @@ public class TestActiveStandbyElector { */ @Test public void testStatNodeRetry() { + elector.joinElection(data); elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK, (Stat) null); elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK, @@ -409,6 +410,7 @@ public class TestActiveStandbyElector { */ @Test public void testStatNodeError() { + elector.joinElection(data); elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME, mockZK, (Stat) null); Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); @@ -592,6 +594,8 @@ public class TestActiveStandbyElector { */ @Test public void testQuitElection() throws Exception { + elector.joinElection(data); + Mockito.verify(mockZK, Mockito.times(0)).close(); elector.quitElection(true); Mockito.verify(mockZK, Mockito.times(1)).close(); // no watches added Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java Wed Apr 4 19:21:01 2012 @@ -21,7 +21,6 @@ package org.apache.hadoop.ha; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import java.io.File; import java.util.Collections; import java.util.UUID; @@ -32,7 +31,6 @@ import org.apache.hadoop.ha.HAZKUtil.ZKA import org.apache.log4j.Level; import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.server.ZooKeeperServer; -import org.apache.zookeeper.test.ClientBase; import org.junit.Test; import org.mockito.AdditionalMatchers; import org.mockito.Mockito; @@ -42,7 +40,7 @@ import com.google.common.primitives.Ints /** * Test for {@link ActiveStandbyElector} using real zookeeper. */ -public class TestActiveStandbyElectorRealZK extends ClientBase { +public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes { static final int NUM_ELECTORS = 2; static { @@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRea @Override public void setUp() throws Exception { - // build.test.dir is used by zookeeper - new File(System.getProperty("build.test.dir", "build")).mkdirs(); super.setUp(); zkServer = getServer(serverFactory); @@ -244,4 +240,19 @@ public class TestActiveStandbyElectorRea checkFatalsAndReset(); } + @Test(timeout=15000) + public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception { + electors[0].ensureParentZNode(); + + stopServer(); + ActiveStandbyElectorTestUtil.waitForElectorState( + null, electors[0], State.NEUTRAL); + startServer(); + waitForServerUp(hostPort, CONNECTION_TIMEOUT); + // Have to sleep to allow time for the clients to reconnect. + Thread.sleep(2000); + Mockito.verify(cbs[0], Mockito.never()).becomeActive(); + Mockito.verify(cbs[1], Mockito.never()).becomeActive(); + checkFatalsAndReset(); + } } Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java Wed Apr 4 19:21:01 2012 @@ -19,7 +19,6 @@ package org.apache.hadoop.ha; import static org.junit.Assert.*; -import java.io.File; import java.security.NoSuchAlgorithmException; import org.apache.commons.logging.impl.Log4JLogger; @@ -32,12 +31,11 @@ import org.apache.zookeeper.KeeperExcept import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.server.auth.DigestAuthenticationProvider; -import org.apache.zookeeper.test.ClientBase; import org.junit.Before; import org.junit.Test; import org.mockito.Mockito; -public class TestZKFailoverController extends ClientBase { +public class TestZKFailoverController extends ClientBaseWithFixes { private Configuration conf; private MiniZKFCCluster cluster; @@ -63,13 +61,6 @@ public class TestZKFailoverController ex ((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL); } - @Override - public void setUp() throws Exception { - // build.test.dir is used by zookeeper - new File(System.getProperty("build.test.dir", "build")).mkdirs(); - super.setUp(); - } - @Before public void setupConfAndServices() { conf = new Configuration(); Modified: hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java?rev=1309554&r1=1309553&r2=1309554&view=diff ============================================================================== --- hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java (original) +++ hadoop/common/branches/HDFS-3042/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java Wed Apr 4 19:21:01 2012 @@ -17,15 +17,10 @@ */ package org.apache.hadoop.ha; -import java.io.File; import java.util.Random; -import java.util.Set; -import javax.management.ObjectName; import org.apache.hadoop.conf.Configuration; -import org.apache.zookeeper.test.ClientBase; -import org.apache.zookeeper.test.JMXEnv; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -39,7 +34,7 @@ import org.mockito.stubbing.Answer; * failovers. While doing so, ensures that a fake "shared resource" * (simulating the shared edits dir) is only owned by one service at a time. */ -public class TestZKFailoverControllerStress extends ClientBase { +public class TestZKFailoverControllerStress extends ClientBaseWithFixes { private static final int STRESS_RUNTIME_SECS = 30; private static final int EXTRA_TIMEOUT_SECS = 10; @@ -47,13 +42,6 @@ public class TestZKFailoverControllerStr private Configuration conf; private MiniZKFCCluster cluster; - @Override - public void setUp() throws Exception { - // build.test.dir is used by zookeeper - new File(System.getProperty("build.test.dir", "build")).mkdirs(); - super.setUp(); - } - @Before public void setupConfAndServices() throws Exception { conf = new Configuration(); @@ -68,22 +56,6 @@ public class TestZKFailoverControllerStr } /** - * ZK seems to have a bug when we muck with its sessions - * behind its back, causing disconnects, etc. This bug - * ends up leaving JMX beans around at the end of the test, - * and ClientBase's teardown method will throw an exception - * if it finds JMX beans leaked. So, clear them out there - * to workaround the ZK bug. See ZOOKEEPER-1438. - */ - @After - public void clearZKJMX() throws Exception { - Set<ObjectName> names = JMXEnv.ensureAll(); - for (ObjectName n : names) { - JMXEnv.conn().unregisterMBean(n); - } - } - - /** * Simply fail back and forth between two services for the * configured amount of time, via expiring their ZK sessions. */