HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector. Contributed by Tom White and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1482401 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2013-05-14 15:37:12 +00:00
parent e29170e771
commit d5a6e764dc
4 changed files with 23 additions and 11 deletions

View File

@@ -720,6 +720,9 @@ Release 2.0.5-beta - UNRELEASED
HADOOP-9307. BufferedFSInputStream.read returns wrong results HADOOP-9307. BufferedFSInputStream.read returns wrong results
after certain seeks. (todd) after certain seeks. (todd)
HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector.
(tom and todd via todd)
Release 2.0.4-alpha - 2013-04-25 Release 2.0.4-alpha - 2013-04-25
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@@ -159,6 +159,7 @@ static enum State {
private int createRetryCount = 0; private int createRetryCount = 0;
private int statRetryCount = 0; private int statRetryCount = 0;
private ZooKeeper zkClient; private ZooKeeper zkClient;
private WatcherWithClientRef watcher;
private ConnectionState zkConnectionState = ConnectionState.TERMINATED; private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
private final ActiveStandbyElectorCallback appClient; private final ActiveStandbyElectorCallback appClient;
@@ -246,6 +247,11 @@ public synchronized void joinElection(byte[] data)
if (data == null) { if (data == null) {
throw new HadoopIllegalArgumentException("data cannot be null"); throw new HadoopIllegalArgumentException("data cannot be null");
} }
if (wantToBeInElection) {
LOG.info("Already in election. Not re-connecting.");
return;
}
appData = new byte[data.length]; appData = new byte[data.length];
System.arraycopy(data, 0, appData, 0, data.length); System.arraycopy(data, 0, appData, 0, data.length);
@@ -615,7 +621,7 @@ protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
// watcher after constructing ZooKeeper, we may miss that event. Instead, // watcher after constructing ZooKeeper, we may miss that event. Instead,
// we construct the watcher first, and have it block any events it receives // we construct the watcher first, and have it block any events it receives
// before we can set its ZooKeeper reference. // before we can set its ZooKeeper reference.
WatcherWithClientRef watcher = new WatcherWithClientRef(); watcher = new WatcherWithClientRef();
ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
watcher.setZooKeeperRef(zk); watcher.setZooKeeperRef(zk);
@@ -753,6 +759,7 @@ private void createConnection() throws IOException, KeeperException {
e); e);
} }
zkClient = null; zkClient = null;
watcher = null;
} }
zkClient = getNewZooKeeper(); zkClient = getNewZooKeeper();
LOG.debug("Created new connection for " + this); LOG.debug("Created new connection for " + this);
@@ -765,12 +772,14 @@ void terminateConnection() {
LOG.debug("Terminating ZK connection for " + this); LOG.debug("Terminating ZK connection for " + this);
ZooKeeper tempZk = zkClient; ZooKeeper tempZk = zkClient;
zkClient = null; zkClient = null;
watcher = null;
try { try {
tempZk.close(); tempZk.close();
} catch(InterruptedException e) { } catch(InterruptedException e) {
LOG.warn(e); LOG.warn(e);
} }
zkConnectionState = ConnectionState.TERMINATED; zkConnectionState = ConnectionState.TERMINATED;
wantToBeInElection = false;
} }
private void reset() { private void reset() {
@@ -914,7 +923,7 @@ private void createLockNodeAsync() {
private void monitorLockNodeAsync() { private void monitorLockNodeAsync() {
zkClient.exists(zkLockFilePath, zkClient.exists(zkLockFilePath,
new WatcherWithClientRef(zkClient), this, watcher, this,
zkClient); zkClient);
} }
@@ -1015,13 +1024,6 @@ private final class WatcherWithClientRef implements Watcher {
* Latch used to wait until the reference to ZooKeeper is set. * Latch used to wait until the reference to ZooKeeper is set.
*/ */
private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
private WatcherWithClientRef() {
}
private WatcherWithClientRef(ZooKeeper zk) {
setZooKeeperRef(zk);
}
/** /**
* Waits for the next event from ZooKeeper to arrive. * Waits for the next event from ZooKeeper to arrive.

View File

@@ -49,6 +49,7 @@ class DummyHAService extends HAServiceTarget {
DummySharedResource sharedResource; DummySharedResource sharedResource;
public int fenceCount = 0; public int fenceCount = 0;
public int activeTransitionCount = 0;
static ArrayList<DummyHAService> instances = Lists.newArrayList(); static ArrayList<DummyHAService> instances = Lists.newArrayList();
int index; int index;
@@ -139,6 +140,7 @@ public void monitorHealth() throws HealthCheckFailedException,
@Override @Override
public void transitionToActive(StateChangeRequestInfo req) throws ServiceFailedException, public void transitionToActive(StateChangeRequestInfo req) throws ServiceFailedException,
AccessControlException, IOException { AccessControlException, IOException {
activeTransitionCount++;
checkUnreachable(); checkUnreachable();
if (failToBecomeActive) { if (failToBecomeActive) {
throw new ServiceFailedException("injected failure"); throw new ServiceFailedException("injected failure");

View File

@@ -422,7 +422,7 @@ public void testCedeActive() throws Exception {
} }
} }
@Test(timeout=15000) @Test(timeout=25000)
public void testGracefulFailover() throws Exception { public void testGracefulFailover() throws Exception {
try { try {
cluster.start(); cluster.start();
@@ -430,11 +430,16 @@ public void testGracefulFailover() throws Exception {
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(1); cluster.waitForActiveLockHolder(1);
cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
Thread.sleep(10000); // allow to quiesce
assertEquals(0, cluster.getService(0).fenceCount); assertEquals(0, cluster.getService(0).fenceCount);
assertEquals(0, cluster.getService(1).fenceCount); assertEquals(0, cluster.getService(1).fenceCount);
assertEquals(2, cluster.getService(0).activeTransitionCount);
assertEquals(1, cluster.getService(1).activeTransitionCount);
} finally { } finally {
cluster.stop(); cluster.stop();
} }