HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3042@1309554 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
30e1b3bba8
commit
b74d742785
@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn't handle failure to become active correc
|
||||
HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
|
||||
|
||||
HADOOP-8215. Security support for ZK Failover controller (todd)
|
||||
|
||||
HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)
|
||||
|
@ -240,8 +240,6 @@ public ActiveStandbyElector(String zookeeperHostPorts,
|
||||
public synchronized void joinElection(byte[] data)
|
||||
throws HadoopIllegalArgumentException {
|
||||
|
||||
LOG.debug("Attempting active election");
|
||||
|
||||
if (data == null) {
|
||||
throw new HadoopIllegalArgumentException("data cannot be null");
|
||||
}
|
||||
@ -249,6 +247,7 @@ public synchronized void joinElection(byte[] data)
|
||||
appData = new byte[data.length];
|
||||
System.arraycopy(data, 0, appData, 0, data.length);
|
||||
|
||||
LOG.debug("Attempting active election for " + this);
|
||||
joinElectionInternal();
|
||||
}
|
||||
|
||||
@ -272,6 +271,9 @@ public synchronized boolean parentZNodeExists()
|
||||
*/
|
||||
public synchronized void ensureParentZNode()
|
||||
throws IOException, InterruptedException {
|
||||
Preconditions.checkState(!wantToBeInElection,
|
||||
"ensureParentZNode() may not be called while in the election");
|
||||
|
||||
String pathParts[] = znodeWorkingDir.split("/");
|
||||
Preconditions.checkArgument(pathParts.length >= 1 &&
|
||||
"".equals(pathParts[0]),
|
||||
@ -305,6 +307,9 @@ public synchronized void ensureParentZNode()
|
||||
*/
|
||||
public synchronized void clearParentZNode()
|
||||
throws IOException, InterruptedException {
|
||||
Preconditions.checkState(!wantToBeInElection,
|
||||
"clearParentZNode() may not be called while in the election");
|
||||
|
||||
try {
|
||||
LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
|
||||
|
||||
@ -393,7 +398,8 @@ public synchronized void processResult(int rc, String path, Object ctx,
|
||||
String name) {
|
||||
if (isStaleClient(ctx)) return;
|
||||
LOG.debug("CreateNode result: " + rc + " for path: " + path
|
||||
+ " connectionState: " + zkConnectionState);
|
||||
+ " connectionState: " + zkConnectionState +
|
||||
" for " + this);
|
||||
|
||||
Code code = Code.get(rc);
|
||||
if (isSuccess(code)) {
|
||||
@ -449,8 +455,13 @@ public synchronized void processResult(int rc, String path, Object ctx,
|
||||
public synchronized void processResult(int rc, String path, Object ctx,
|
||||
Stat stat) {
|
||||
if (isStaleClient(ctx)) return;
|
||||
|
||||
assert wantToBeInElection :
|
||||
"Got a StatNode result after quitting election";
|
||||
|
||||
LOG.debug("StatNode result: " + rc + " for path: " + path
|
||||
+ " connectionState: " + zkConnectionState);
|
||||
+ " connectionState: " + zkConnectionState + " for " + this);
|
||||
|
||||
|
||||
Code code = Code.get(rc);
|
||||
if (isSuccess(code)) {
|
||||
@ -517,7 +528,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
|
||||
if (isStaleClient(zk)) return;
|
||||
LOG.debug("Watcher event type: " + eventType + " with state:"
|
||||
+ event.getState() + " for path:" + event.getPath()
|
||||
+ " connectionState: " + zkConnectionState);
|
||||
+ " connectionState: " + zkConnectionState
|
||||
+ " for " + this);
|
||||
|
||||
if (eventType == Event.EventType.None) {
|
||||
// the connection state has changed
|
||||
@ -528,7 +540,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
|
||||
// be undone
|
||||
ConnectionState prevConnectionState = zkConnectionState;
|
||||
zkConnectionState = ConnectionState.CONNECTED;
|
||||
if (prevConnectionState == ConnectionState.DISCONNECTED) {
|
||||
if (prevConnectionState == ConnectionState.DISCONNECTED &&
|
||||
wantToBeInElection) {
|
||||
monitorActiveStatus();
|
||||
}
|
||||
break;
|
||||
@ -600,12 +613,14 @@ protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
|
||||
}
|
||||
|
||||
private void fatalError(String errorMessage) {
|
||||
LOG.fatal(errorMessage);
|
||||
reset();
|
||||
appClient.notifyFatalError(errorMessage);
|
||||
}
|
||||
|
||||
private void monitorActiveStatus() {
|
||||
LOG.debug("Monitoring active leader");
|
||||
assert wantToBeInElection;
|
||||
LOG.debug("Monitoring active leader for " + this);
|
||||
statRetryCount = 0;
|
||||
monitorLockNodeAsync();
|
||||
}
|
||||
@ -688,7 +703,7 @@ private boolean reEstablishSession() {
|
||||
int connectionRetryCount = 0;
|
||||
boolean success = false;
|
||||
while(!success && connectionRetryCount < NUM_RETRIES) {
|
||||
LOG.debug("Establishing zookeeper connection");
|
||||
LOG.debug("Establishing zookeeper connection for " + this);
|
||||
try {
|
||||
createConnection();
|
||||
success = true;
|
||||
@ -703,13 +718,14 @@ private boolean reEstablishSession() {
|
||||
|
||||
private void createConnection() throws IOException {
|
||||
zkClient = getNewZooKeeper();
|
||||
LOG.debug("Created new connection for " + this);
|
||||
}
|
||||
|
||||
private void terminateConnection() {
|
||||
if (zkClient == null) {
|
||||
return;
|
||||
}
|
||||
LOG.debug("Terminating ZK connection");
|
||||
LOG.debug("Terminating ZK connection for " + this);
|
||||
ZooKeeper tempZk = zkClient;
|
||||
zkClient = null;
|
||||
try {
|
||||
@ -735,7 +751,7 @@ private boolean becomeActive() {
|
||||
Stat oldBreadcrumbStat = fenceOldActive();
|
||||
writeBreadCrumbNode(oldBreadcrumbStat);
|
||||
|
||||
LOG.debug("Becoming active");
|
||||
LOG.debug("Becoming active for " + this);
|
||||
appClient.becomeActive();
|
||||
state = State.ACTIVE;
|
||||
return true;
|
||||
@ -838,7 +854,7 @@ public byte[] run() throws KeeperException, InterruptedException {
|
||||
|
||||
private void becomeStandby() {
|
||||
if (state != State.STANDBY) {
|
||||
LOG.debug("Becoming standby");
|
||||
LOG.debug("Becoming standby for " + this);
|
||||
state = State.STANDBY;
|
||||
appClient.becomeStandby();
|
||||
}
|
||||
@ -846,7 +862,7 @@ private void becomeStandby() {
|
||||
|
||||
private void enterNeutralMode() {
|
||||
if (state != State.NEUTRAL) {
|
||||
LOG.debug("Entering neutral mode");
|
||||
LOG.debug("Entering neutral mode for " + this);
|
||||
state = State.NEUTRAL;
|
||||
appClient.enterNeutralMode();
|
||||
}
|
||||
@ -943,8 +959,14 @@ private WatcherWithClientRef(ZooKeeper zk) {
|
||||
|
||||
@Override
|
||||
public void process(WatchedEvent event) {
|
||||
ActiveStandbyElector.this.processWatchEvent(
|
||||
zk, event);
|
||||
try {
|
||||
ActiveStandbyElector.this.processWatchEvent(
|
||||
zk, event);
|
||||
} catch (Throwable t) {
|
||||
fatalError(
|
||||
"Failed to process watcher event " + event + ": " +
|
||||
StringUtils.stringifyException(t));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -973,4 +995,12 @@ private static boolean shouldRetry(Code code) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "elector id=" + System.identityHashCode(this) +
|
||||
" appData=" +
|
||||
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
|
||||
" cb=" + appClient;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -154,6 +154,7 @@ private int doRun(String[] args)
|
||||
try {
|
||||
mainLoop();
|
||||
} finally {
|
||||
elector.quitElection(true);
|
||||
healthMonitor.shutdown();
|
||||
healthMonitor.join();
|
||||
}
|
||||
@ -379,6 +380,11 @@ public void fenceOldActive(byte[] data) {
|
||||
throw new RuntimeException("Unable to fence " + target);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Elector callbacks for " + localTarget;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.ha;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.management.ObjectName;
|
||||
|
||||
import org.apache.zookeeper.test.ClientBase;
|
||||
import org.apache.zookeeper.test.JMXEnv;
|
||||
|
||||
/**
|
||||
* A subclass of ZK's ClientBase testing utility, with some fixes
|
||||
* necessary for running in the Hadoop context.
|
||||
*/
|
||||
public class ClientBaseWithFixes extends ClientBase {
|
||||
|
||||
/**
|
||||
* When running on the Jenkins setup, we need to ensure that this
|
||||
* build directory exists before running the tests.
|
||||
*/
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
// build.test.dir is used by zookeeper
|
||||
new File(System.getProperty("build.test.dir", "build")).mkdirs();
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
/**
|
||||
* ZK seems to have a bug when we muck with its sessions
|
||||
* behind its back, causing disconnects, etc. This bug
|
||||
* ends up leaving JMX beans around at the end of the test,
|
||||
* and ClientBase's teardown method will throw an exception
|
||||
* if it finds JMX beans leaked. So, clear them out there
|
||||
* to workaround the ZK bug. See ZOOKEEPER-1438.
|
||||
*/
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
Set<ObjectName> names = JMXEnv.ensureAll();
|
||||
for (ObjectName n : names) {
|
||||
try {
|
||||
JMXEnv.conn().unregisterMBean(n);
|
||||
} catch (Throwable t) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -389,6 +389,7 @@ public void testCreateNodeResultRetryNoNode() {
|
||||
*/
|
||||
@Test
|
||||
public void testStatNodeRetry() {
|
||||
elector.joinElection(data);
|
||||
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
|
||||
(Stat) null);
|
||||
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
|
||||
@ -409,6 +410,7 @@ public void testStatNodeRetry() {
|
||||
*/
|
||||
@Test
|
||||
public void testStatNodeError() {
|
||||
elector.joinElection(data);
|
||||
elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
|
||||
mockZK, (Stat) null);
|
||||
Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
|
||||
@ -592,6 +594,8 @@ public void testSuccessiveStandbyCalls() {
|
||||
*/
|
||||
@Test
|
||||
public void testQuitElection() throws Exception {
|
||||
elector.joinElection(data);
|
||||
Mockito.verify(mockZK, Mockito.times(0)).close();
|
||||
elector.quitElection(true);
|
||||
Mockito.verify(mockZK, Mockito.times(1)).close();
|
||||
// no watches added
|
||||
|
@ -21,7 +21,6 @@
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collections;
|
||||
import java.util.UUID;
|
||||
|
||||
@ -32,7 +31,6 @@
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.zookeeper.ZooDefs.Ids;
|
||||
import org.apache.zookeeper.server.ZooKeeperServer;
|
||||
import org.apache.zookeeper.test.ClientBase;
|
||||
import org.junit.Test;
|
||||
import org.mockito.AdditionalMatchers;
|
||||
import org.mockito.Mockito;
|
||||
@ -42,7 +40,7 @@
|
||||
/**
|
||||
* Test for {@link ActiveStandbyElector} using real zookeeper.
|
||||
*/
|
||||
public class TestActiveStandbyElectorRealZK extends ClientBase {
|
||||
public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
|
||||
static final int NUM_ELECTORS = 2;
|
||||
|
||||
static {
|
||||
@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRealZK extends ClientBase {
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
// build.test.dir is used by zookeeper
|
||||
new File(System.getProperty("build.test.dir", "build")).mkdirs();
|
||||
super.setUp();
|
||||
|
||||
zkServer = getServer(serverFactory);
|
||||
@ -244,4 +240,19 @@ public void testHandleSessionExpirationOfStandby() throws Exception {
|
||||
checkFatalsAndReset();
|
||||
}
|
||||
|
||||
@Test(timeout=15000)
|
||||
public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
|
||||
electors[0].ensureParentZNode();
|
||||
|
||||
stopServer();
|
||||
ActiveStandbyElectorTestUtil.waitForElectorState(
|
||||
null, electors[0], State.NEUTRAL);
|
||||
startServer();
|
||||
waitForServerUp(hostPort, CONNECTION_TIMEOUT);
|
||||
// Have to sleep to allow time for the clients to reconnect.
|
||||
Thread.sleep(2000);
|
||||
Mockito.verify(cbs[0], Mockito.never()).becomeActive();
|
||||
Mockito.verify(cbs[1], Mockito.never()).becomeActive();
|
||||
checkFatalsAndReset();
|
||||
}
|
||||
}
|
||||
|
@ -19,7 +19,6 @@
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
||||
import org.apache.commons.logging.impl.Log4JLogger;
|
||||
@ -32,12 +31,11 @@
|
||||
import org.apache.zookeeper.ZooKeeper;
|
||||
import org.apache.zookeeper.data.Stat;
|
||||
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
|
||||
import org.apache.zookeeper.test.ClientBase;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
public class TestZKFailoverController extends ClientBase {
|
||||
public class TestZKFailoverController extends ClientBaseWithFixes {
|
||||
private Configuration conf;
|
||||
private MiniZKFCCluster cluster;
|
||||
|
||||
@ -63,13 +61,6 @@ public class TestZKFailoverController extends ClientBase {
|
||||
((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
// build.test.dir is used by zookeeper
|
||||
new File(System.getProperty("build.test.dir", "build")).mkdirs();
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setupConfAndServices() {
|
||||
conf = new Configuration();
|
||||
|
@ -17,15 +17,10 @@
|
||||
*/
|
||||
package org.apache.hadoop.ha;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.management.ObjectName;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.zookeeper.test.ClientBase;
|
||||
import org.apache.zookeeper.test.JMXEnv;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
@ -39,7 +34,7 @@
|
||||
* failovers. While doing so, ensures that a fake "shared resource"
|
||||
* (simulating the shared edits dir) is only owned by one service at a time.
|
||||
*/
|
||||
public class TestZKFailoverControllerStress extends ClientBase {
|
||||
public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
|
||||
|
||||
private static final int STRESS_RUNTIME_SECS = 30;
|
||||
private static final int EXTRA_TIMEOUT_SECS = 10;
|
||||
@ -47,13 +42,6 @@ public class TestZKFailoverControllerStress extends ClientBase {
|
||||
private Configuration conf;
|
||||
private MiniZKFCCluster cluster;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
// build.test.dir is used by zookeeper
|
||||
new File(System.getProperty("build.test.dir", "build")).mkdirs();
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setupConfAndServices() throws Exception {
|
||||
conf = new Configuration();
|
||||
@ -67,22 +55,6 @@ public void stopCluster() throws Exception {
|
||||
cluster.stop();
|
||||
}
|
||||
|
||||
/**
|
||||
* ZK seems to have a bug when we muck with its sessions
|
||||
* behind its back, causing disconnects, etc. This bug
|
||||
* ends up leaving JMX beans around at the end of the test,
|
||||
* and ClientBase's teardown method will throw an exception
|
||||
* if it finds JMX beans leaked. So, clear them out there
|
||||
* to workaround the ZK bug. See ZOOKEEPER-1438.
|
||||
*/
|
||||
@After
|
||||
public void clearZKJMX() throws Exception {
|
||||
Set<ObjectName> names = JMXEnv.ensureAll();
|
||||
for (ObjectName n : names) {
|
||||
JMXEnv.conn().unregisterMBean(n);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simply fail back and forth between two services for the
|
||||
* configured amount of time, via expiring their ZK sessions.
|
||||
|
@ -19,12 +19,12 @@
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.ha.ClientBaseWithFixes;
|
||||
import org.apache.hadoop.ha.NodeFencer;
|
||||
import org.apache.hadoop.ha.ZKFailoverController;
|
||||
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
|
||||
@ -36,7 +36,6 @@
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
|
||||
import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
|
||||
import org.apache.zookeeper.test.ClientBase;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
@ -44,20 +43,13 @@
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
|
||||
public class TestDFSZKFailoverController extends ClientBase {
|
||||
public class TestDFSZKFailoverController extends ClientBaseWithFixes {
|
||||
private Configuration conf;
|
||||
private MiniDFSCluster cluster;
|
||||
private TestContext ctx;
|
||||
private ZKFCThread thr1, thr2;
|
||||
private FileSystem fs;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
// build.test.dir is used by zookeeper
|
||||
new File(System.getProperty("build.test.dir", "build")).mkdirs();
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
conf = new Configuration();
|
||||
|
Loading…
Reference in New Issue
Block a user