From b74d7427855eb7e20be70155c11acac0e333bd6a Mon Sep 17 00:00:00 2001
From: Todd Lipcon
Date: Wed, 4 Apr 2012 19:21:01 +0000
Subject: [PATCH] HADOOP-8245. Fix flakiness in TestZKFailoverController.
 Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3042@1309554 13f79535-47bb-0310-9956-ffa450edef68
---
 .../hadoop-common/CHANGES.HDFS-3042.txt            |  2 +
 .../hadoop/ha/ActiveStandbyElector.java            | 58 +++++++++++++----
 .../hadoop/ha/ZKFailoverController.java            |  6 ++
 .../apache/hadoop/ha/ClientBaseWithFixes.java      | 64 +++++++++++++++++++
 .../hadoop/ha/TestActiveStandbyElector.java        |  4 ++
 .../ha/TestActiveStandbyElectorRealZK.java         | 21 ++++--
 .../hadoop/ha/TestZKFailoverController.java        | 11 +---
 .../ha/TestZKFailoverControllerStress.java         | 30 +--------
 .../ha/TestDFSZKFailoverController.java            | 12 +---
 9 files changed, 140 insertions(+), 68 deletions(-)
 create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java

diff --git a/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt b/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
index c3a19763b1..07e919f3e0 100644
--- a/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.HDFS-3042.txt
@@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn't handle failure to become active correc
 HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
 
 HADOOP-8215. Security support for ZK Failover controller (todd)
+
+HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
index da2fa26ca0..50b85204be 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@@ -240,8 +240,6 @@ public ActiveStandbyElector(String zookeeperHostPorts,
 
   public synchronized void joinElection(byte[] data)
       throws HadoopIllegalArgumentException {
-    LOG.debug("Attempting active election");
-
     if (data == null) {
       throw new HadoopIllegalArgumentException("data cannot be null");
     }
@@ -249,6 +247,7 @@ public synchronized void joinElection(byte[] data)
     appData = new byte[data.length];
     System.arraycopy(data, 0, appData, 0, data.length);
 
+    LOG.debug("Attempting active election for " + this);
     joinElectionInternal();
   }
 
@@ -272,6 +271,9 @@ public synchronized boolean parentZNodeExists()
    */
   public synchronized void ensureParentZNode()
       throws IOException, InterruptedException {
+    Preconditions.checkState(!wantToBeInElection,
+        "ensureParentZNode() may not be called while in the election");
+
     String pathParts[] = znodeWorkingDir.split("/");
     Preconditions.checkArgument(pathParts.length >= 1 &&
         "".equals(pathParts[0]),
@@ -305,6 +307,9 @@ public synchronized void ensureParentZNode()
    */
   public synchronized void clearParentZNode()
       throws IOException, InterruptedException {
+    Preconditions.checkState(!wantToBeInElection,
+        "clearParentZNode() may not be called while in the election");
+
     try {
       LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
 
@@ -393,7 +398,8 @@ public synchronized void processResult(int rc, String path, Object ctx,
       String name) {
     if (isStaleClient(ctx)) return;
     LOG.debug("CreateNode result: " + rc + " for path: " + path
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState
+        + " for " + this);
 
     Code code = Code.get(rc);
     if (isSuccess(code)) {
@@ -449,8 +455,13 @@ public synchronized void processResult(int rc, String path, Object ctx,
   public synchronized void processResult(int rc, String path, Object ctx,
       Stat stat) {
     if (isStaleClient(ctx)) return;
+
+    assert wantToBeInElection :
+        "Got a StatNode result after quitting election";
+
     LOG.debug("StatNode result: " + rc + " for path: " + path
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState + " for " + this);
+
 
     Code code = Code.get(rc);
     if (isSuccess(code)) {
@@ -517,7 +528,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
     if (isStaleClient(zk)) return;
     LOG.debug("Watcher event type: " + eventType + " with state:"
         + event.getState() + " for path:" + event.getPath()
-        + " connectionState: " + zkConnectionState);
+        + " connectionState: " + zkConnectionState
+        + " for " + this);
 
     if (eventType == Event.EventType.None) {
       // the connection state has changed
@@ -528,7 +540,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
         // be undone
         ConnectionState prevConnectionState = zkConnectionState;
         zkConnectionState = ConnectionState.CONNECTED;
-        if (prevConnectionState == ConnectionState.DISCONNECTED) {
+        if (prevConnectionState == ConnectionState.DISCONNECTED &&
+            wantToBeInElection) {
           monitorActiveStatus();
         }
         break;
@@ -600,12 +613,14 @@ protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
   }
 
   private void fatalError(String errorMessage) {
+    LOG.fatal(errorMessage);
     reset();
     appClient.notifyFatalError(errorMessage);
   }
 
   private void monitorActiveStatus() {
-    LOG.debug("Monitoring active leader");
+    assert wantToBeInElection;
+    LOG.debug("Monitoring active leader for " + this);
     statRetryCount = 0;
     monitorLockNodeAsync();
   }
@@ -688,7 +703,7 @@ private boolean reEstablishSession() {
     int connectionRetryCount = 0;
     boolean success = false;
     while(!success && connectionRetryCount < NUM_RETRIES) {
-      LOG.debug("Establishing zookeeper connection");
+      LOG.debug("Establishing zookeeper connection for " + this);
       try {
         createConnection();
         success = true;
@@ -703,13 +718,14 @@ private boolean reEstablishSession() {
 
   private void createConnection() throws IOException {
     zkClient = getNewZooKeeper();
+    LOG.debug("Created new connection for " + this);
   }
 
   private void terminateConnection() {
     if (zkClient == null) {
       return;
     }
-    LOG.debug("Terminating ZK connection");
+    LOG.debug("Terminating ZK connection for " + this);
     ZooKeeper tempZk = zkClient;
     zkClient = null;
     try {
@@ -735,7 +751,7 @@ private boolean becomeActive() {
       Stat oldBreadcrumbStat = fenceOldActive();
       writeBreadCrumbNode(oldBreadcrumbStat);
 
-      LOG.debug("Becoming active");
+      LOG.debug("Becoming active for " + this);
       appClient.becomeActive();
       state = State.ACTIVE;
       return true;
@@ -838,7 +854,7 @@ public byte[] run() throws KeeperException, InterruptedException {
 
   private void becomeStandby() {
     if (state != State.STANDBY) {
-      LOG.debug("Becoming standby");
+      LOG.debug("Becoming standby for " + this);
       state = State.STANDBY;
       appClient.becomeStandby();
     }
@@ -846,7 +862,7 @@ private void becomeStandby() {
 
   private void enterNeutralMode() {
     if (state != State.NEUTRAL) {
-      LOG.debug("Entering neutral mode");
+      LOG.debug("Entering neutral mode for " + this);
       state = State.NEUTRAL;
       appClient.enterNeutralMode();
     }
@@ -943,8 +959,14 @@ private WatcherWithClientRef(ZooKeeper zk) {
 
     @Override
     public void process(WatchedEvent event) {
-      ActiveStandbyElector.this.processWatchEvent(
-          zk, event);
+      try {
+        ActiveStandbyElector.this.processWatchEvent(
+            zk, event);
+      } catch (Throwable t) {
+        fatalError(
+            "Failed to process watcher event " + event + ": " +
+            StringUtils.stringifyException(t));
+      }
     }
   }
 
@@ -972,5 +994,13 @@ private static boolean shouldRetry(Code code) {
     }
     return false;
   }
+
+  @Override
+  public String toString() {
+    return "elector id=" + System.identityHashCode(this) +
+        " appData=" +
+        ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
+        " cb=" + appClient;
+  }
 
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
index 6c05256521..55cfc56af7 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
@@ -154,6 +154,7 @@ private int doRun(String[] args)
     try {
       mainLoop();
     } finally {
+      elector.quitElection(true);
       healthMonitor.shutdown();
       healthMonitor.join();
     }
@@ -379,6 +380,11 @@ public void fenceOldActive(byte[] data) {
         throw new RuntimeException("Unable to fence " + target);
       }
     }
+
+    @Override
+    public String toString() {
+      return "Elector callbacks for " + localTarget;
+    }
   }
 
   /**
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
new file mode 100644
index 0000000000..8a89fbfd5a
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/ClientBaseWithFixes.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.File;
+import java.util.Set;
+
+import javax.management.ObjectName;
+
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.JMXEnv;
+
+/**
+ * A subclass of ZK's ClientBase testing utility, with some fixes
+ * necessary for running in the Hadoop context.
+ */
+public class ClientBaseWithFixes extends ClientBase {
+
+  /**
+   * When running on the Jenkins setup, we need to ensure that this
+   * build directory exists before running the tests.
+   */
+  @Override
+  public void setUp() throws Exception {
+    // build.test.dir is used by zookeeper
+    new File(System.getProperty("build.test.dir", "build")).mkdirs();
+    super.setUp();
+  }
+
+  /**
+   * ZK seems to have a bug when we muck with its sessions
+   * behind its back, causing disconnects, etc. This bug
+   * ends up leaving JMX beans around at the end of the test,
+   * and ClientBase's teardown method will throw an exception
+   * if it finds JMX beans leaked. So, clear them out there
+   * to workaround the ZK bug. See ZOOKEEPER-1438.
+   */
+  @Override
+  public void tearDown() throws Exception {
+    Set<ObjectName> names = JMXEnv.ensureAll();
+    for (ObjectName n : names) {
+      try {
+        JMXEnv.conn().unregisterMBean(n);
+      } catch (Throwable t) {
+        // ignore
+      }
+    }
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
index 469de10f53..2eba9671a3 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
@@ -389,6 +389,7 @@ public void testCreateNodeResultRetryNoNode() {
    */
   @Test
   public void testStatNodeRetry() {
+    elector.joinElection(data);
     elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
         (Stat) null);
     elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
@@ -409,6 +410,7 @@ public void testStatNodeRetry() {
    */
   @Test
   public void testStatNodeError() {
+    elector.joinElection(data);
     elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
         mockZK, (Stat) null);
     Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
@@ -592,6 +594,8 @@ public void testSuccessiveStandbyCalls() {
    */
   @Test
   public void testQuitElection() throws Exception {
+    elector.joinElection(data);
+    Mockito.verify(mockZK, Mockito.times(0)).close();
     elector.quitElection(true);
     Mockito.verify(mockZK, Mockito.times(1)).close();
     // no watches added
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
index 1800ee0d43..d51d5fa700 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
@@ -21,7 +21,6 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import java.io.File;
 import java.util.Collections;
 import java.util.UUID;
 
@@ -32,7 +31,6 @@
 import org.apache.log4j.Level;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
 import org.junit.Test;
 import org.mockito.AdditionalMatchers;
 import org.mockito.Mockito;
@@ -42,7 +40,7 @@
 /**
  * Test for {@link ActiveStandbyElector} using real zookeeper.
  */
-public class TestActiveStandbyElectorRealZK extends ClientBase {
+public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
   static final int NUM_ELECTORS = 2;
 
   static {
@@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRealZK extends ClientBase {
 
   @Override
   public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
     super.setUp();
 
     zkServer = getServer(serverFactory);
@@ -244,4 +240,19 @@ public void testHandleSessionExpirationOfStandby() throws Exception {
     checkFatalsAndReset();
   }
 
+  @Test(timeout=15000)
+  public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
+    electors[0].ensureParentZNode();
+
+    stopServer();
+    ActiveStandbyElectorTestUtil.waitForElectorState(
+        null, electors[0], State.NEUTRAL);
+    startServer();
+    waitForServerUp(hostPort, CONNECTION_TIMEOUT);
+    // Have to sleep to allow time for the clients to reconnect.
+    Thread.sleep(2000);
+    Mockito.verify(cbs[0], Mockito.never()).becomeActive();
+    Mockito.verify(cbs[1], Mockito.never()).becomeActive();
+    checkFatalsAndReset();
+  }
 }
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
index e1bfb5dacf..4cdc38e529 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
@@ -19,7 +19,6 @@
 
 import static org.junit.Assert.*;
 
-import java.io.File;
 import java.security.NoSuchAlgorithmException;
 
 import org.apache.commons.logging.impl.Log4JLogger;
@@ -32,12 +31,11 @@
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
-import org.apache.zookeeper.test.ClientBase;
 import org.junit.Before;
 import org.junit.Test;
 import org.mockito.Mockito;
 
-public class TestZKFailoverController extends ClientBase {
+public class TestZKFailoverController extends ClientBaseWithFixes {
   private Configuration conf;
   private MiniZKFCCluster cluster;
 
@@ -63,13 +61,6 @@ public class TestZKFailoverController extends ClientBase {
     ((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
   }
 
-  @Override
-  public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
-    super.setUp();
-  }
-
   @Before
   public void setupConfAndServices() {
     conf = new Configuration();
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
index 9914d8f4f9..508bb00199 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java
@@ -17,15 +17,10 @@
  */
 package org.apache.hadoop.ha;
 
-import java.io.File;
 import java.util.Random;
-import java.util.Set;
 
-import javax.management.ObjectName;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.zookeeper.test.ClientBase;
-import org.apache.zookeeper.test.JMXEnv;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -39,7 +34,7 @@
  * failovers. While doing so, ensures that a fake "shared resource"
 * (simulating the shared edits dir) is only owned by one service at a time.
  */
-public class TestZKFailoverControllerStress extends ClientBase {
+public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
 
   private static final int STRESS_RUNTIME_SECS = 30;
   private static final int EXTRA_TIMEOUT_SECS = 10;
@@ -47,13 +42,6 @@ public class TestZKFailoverControllerStress extends ClientBase {
   private Configuration conf;
   private MiniZKFCCluster cluster;
 
-  @Override
-  public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
-    super.setUp();
-  }
-
   @Before
   public void setupConfAndServices() throws Exception {
     conf = new Configuration();
@@ -67,22 +55,6 @@ public void stopCluster() throws Exception {
     cluster.stop();
   }
 
-  /**
-   * ZK seems to have a bug when we muck with its sessions
-   * behind its back, causing disconnects, etc. This bug
-   * ends up leaving JMX beans around at the end of the test,
-   * and ClientBase's teardown method will throw an exception
-   * if it finds JMX beans leaked. So, clear them out there
-   * to workaround the ZK bug. See ZOOKEEPER-1438.
-   */
-  @After
-  public void clearZKJMX() throws Exception {
-    Set<ObjectName> names = JMXEnv.ensureAll();
-    for (ObjectName n : names) {
-      JMXEnv.conn().unregisterMBean(n);
-    }
-  }
-
   /**
    * Simply fail back and forth between two services for the
    * configured amount of time, via expiring their ZK sessions.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSZKFailoverController.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSZKFailoverController.java
index 5b4cfed667..802bbd486a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSZKFailoverController.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSZKFailoverController.java
@@ -19,12 +19,12 @@
 
 import static org.junit.Assert.*;
 
-import java.io.File;
 import java.util.concurrent.TimeoutException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ha.ClientBaseWithFixes;
 import org.apache.hadoop.ha.NodeFencer;
 import org.apache.hadoop.ha.ZKFailoverController;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
@@ -36,7 +36,6 @@
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
 import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
-import org.apache.zookeeper.test.ClientBase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -44,20 +43,13 @@
 
 import com.google.common.base.Supplier;
 
-public class TestDFSZKFailoverController extends ClientBase {
+public class TestDFSZKFailoverController extends ClientBaseWithFixes {
   private Configuration conf;
   private MiniDFSCluster cluster;
   private TestContext ctx;
   private ZKFCThread thr1, thr2;
   private FileSystem fs;
 
-  @Override
-  public void setUp() throws Exception {
-    // build.test.dir is used by zookeeper
-    new File(System.getProperty("build.test.dir", "build")).mkdirs();
-    super.setUp();
-  }
-
   @Before
   public void setup() throws Exception {
     conf = new Configuration();