HADOOP-7991. HA: the FailoverController should check the standby is ready before failing over. Contributed by Eli Collins
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1239774 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4d779e088a
commit
4324e1bcd7
@ -39,3 +39,6 @@ HADOOP-7983. HA: failover should be able to pass args to fencers. (eli)
|
||||
|
||||
HADOOP-7938. HA: the FailoverController should optionally fence the
|
||||
active during failover. (eli)
|
||||
|
||||
HADOOP-7991. HA: the FailoverController should check the standby is
|
||||
ready before failing over. (eli)
|
||||
|
@ -46,12 +46,19 @@ public class FailoverController {
|
||||
* failover to, eg to prevent failing over to a service (eg due
|
||||
* to it being inaccessible, already active, not healthy, etc).
|
||||
*
|
||||
* An option to ignore toSvc if it claims it is not ready to
|
||||
* become active is provided in case performing a failover will
|
||||
* allow it to become active, eg because it triggers a log roll
|
||||
* so the standby can learn about new blocks and leave safemode.
|
||||
*
|
||||
* @param toSvc service to make active
|
||||
* @param toSvcName name of service to make active
|
||||
* @param forceActive ignore toSvc if it reports that it is not ready
|
||||
* @throws FailoverFailedException if we should avoid failover
|
||||
*/
|
||||
private static void preFailoverChecks(HAServiceProtocol toSvc,
|
||||
InetSocketAddress toSvcAddr)
|
||||
InetSocketAddress toSvcAddr,
|
||||
boolean forceActive)
|
||||
throws FailoverFailedException {
|
||||
HAServiceState toSvcState;
|
||||
try {
|
||||
@ -74,7 +81,17 @@ private static void preFailoverChecks(HAServiceProtocol toSvc,
|
||||
throw new FailoverFailedException(
|
||||
"Got an IO exception", e);
|
||||
}
|
||||
// TODO(HA): ask toSvc if it's capable. Eg not in SM.
|
||||
try {
|
||||
if (!toSvc.readyToBecomeActive()) {
|
||||
if (!forceActive) {
|
||||
throw new FailoverFailedException(
|
||||
toSvcAddr + " is not ready to become active");
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new FailoverFailedException(
|
||||
"Got an IO exception", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -87,16 +104,19 @@ private static void preFailoverChecks(HAServiceProtocol toSvc,
|
||||
* @param toSvcAddr addr of the service to make active
|
||||
* @param fencer for fencing fromSvc
|
||||
* @param forceFence to fence fromSvc even if not strictly necessary
|
||||
* @param forceActive try to make toSvc active even if it is not ready
|
||||
* @throws FailoverFailedException if the failover fails
|
||||
*/
|
||||
public static void failover(HAServiceProtocol fromSvc,
|
||||
InetSocketAddress fromSvcAddr,
|
||||
HAServiceProtocol toSvc,
|
||||
InetSocketAddress toSvcAddr,
|
||||
NodeFencer fencer, boolean forceFence)
|
||||
NodeFencer fencer,
|
||||
boolean forceFence,
|
||||
boolean forceActive)
|
||||
throws FailoverFailedException {
|
||||
Preconditions.checkArgument(fencer != null, "failover requires a fencer");
|
||||
preFailoverChecks(toSvc, toSvcAddr);
|
||||
preFailoverChecks(toSvc, toSvcAddr, forceActive);
|
||||
|
||||
// Try to make fromSvc standby
|
||||
boolean tryFence = true;
|
||||
@ -145,7 +165,9 @@ public static void failover(HAServiceProtocol fromSvc,
|
||||
try {
|
||||
// Unconditionally fence toSvc in case it is still trying to
|
||||
// become active, eg we timed out waiting for its response.
|
||||
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true);
|
||||
// Unconditionally force fromSvc to become active since it
|
||||
// was previously active when we initiated failover.
|
||||
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true);
|
||||
} catch (FailoverFailedException ffe) {
|
||||
msg += ". Failback to " + fromSvcAddr +
|
||||
" failed (" + ffe.getMessage() + ")";
|
||||
|
@ -47,7 +47,8 @@
|
||||
|
||||
public abstract class HAAdmin extends Configured implements Tool {
|
||||
|
||||
private static final String FORCEFENCE = "forcefence";
|
||||
private static final String FORCEFENCE = "forcefence";
|
||||
private static final String FORCEACTIVE = "forceactive";
|
||||
|
||||
private static Map<String, UsageInfo> USAGE =
|
||||
ImmutableMap.<String, UsageInfo>builder()
|
||||
@ -56,9 +57,11 @@ public abstract class HAAdmin extends Configured implements Tool {
|
||||
.put("-transitionToStandby",
|
||||
new UsageInfo("<host:port>", "Transitions the daemon into Standby state"))
|
||||
.put("-failover",
|
||||
new UsageInfo("[--"+FORCEFENCE+"] <host:port> <host:port>",
|
||||
new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <host:port> <host:port>",
|
||||
"Failover from the first daemon to the second.\n" +
|
||||
"Unconditionally fence services if the "+FORCEFENCE+" option is used."))
|
||||
"Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
|
||||
"Try to failover to the target service even if it is not ready if the " +
|
||||
FORCEACTIVE + " option is used."))
|
||||
.put("-getServiceState",
|
||||
new UsageInfo("<host:port>", "Returns the state of the daemon"))
|
||||
.put("-checkHealth",
|
||||
@ -124,12 +127,14 @@ private int failover(final String[] argv)
|
||||
throws IOException, ServiceFailedException {
|
||||
Configuration conf = getConf();
|
||||
boolean forceFence = false;
|
||||
boolean forceActive = false;
|
||||
|
||||
Options failoverOpts = new Options();
|
||||
// "-failover" isn't really an option but we need to add
|
||||
// it to appease CommandLineParser
|
||||
failoverOpts.addOption("failover", false, "failover");
|
||||
failoverOpts.addOption(FORCEFENCE, false, "force fencing");
|
||||
failoverOpts.addOption(FORCEACTIVE, false, "force failover");
|
||||
|
||||
CommandLineParser parser = new GnuParser();
|
||||
CommandLine cmd;
|
||||
@ -137,6 +142,7 @@ private int failover(final String[] argv)
|
||||
try {
|
||||
cmd = parser.parse(failoverOpts, argv);
|
||||
forceFence = cmd.hasOption(FORCEFENCE);
|
||||
forceActive = cmd.hasOption(FORCEACTIVE);
|
||||
} catch (ParseException pe) {
|
||||
errOut.println("failover: incorrect arguments");
|
||||
printUsage(errOut, "-failover");
|
||||
@ -172,7 +178,7 @@ private int failover(final String[] argv)
|
||||
|
||||
try {
|
||||
FailoverController.failover(proto1, addr1, proto2, addr2,
|
||||
fencer, forceFence);
|
||||
fencer, forceFence, forceActive);
|
||||
out.println("Failover from "+args[0]+" to "+args[1]+" successful");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
errOut.println("Failover failed: " + ffe.getLocalizedMessage());
|
||||
|
@ -112,4 +112,15 @@ public void transitionToStandby() throws ServiceFailedException,
|
||||
* if other errors happen
|
||||
*/
|
||||
public HAServiceState getServiceState() throws IOException;
|
||||
|
||||
/**
|
||||
* Return true if the service is capable and ready to transition
|
||||
* from the standby state to the active state.
|
||||
*
|
||||
* @return true if the service is ready to become active, false otherwise.
|
||||
* @throws IOException
|
||||
* if other errors happen
|
||||
*/
|
||||
public boolean readyToBecomeActive() throws ServiceFailedException,
|
||||
IOException;
|
||||
}
|
||||
|
@ -79,6 +79,11 @@ public void transitionToStandby() throws ServiceFailedException, IOException {
|
||||
public HAServiceState getServiceState() throws IOException {
|
||||
return state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -88,13 +93,13 @@ public void testFailoverAndFailback() throws Exception {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
AlwaysSucceedFencer.fenceCalled = 0;
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
|
||||
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
|
||||
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
|
||||
|
||||
AlwaysSucceedFencer.fenceCalled = 0;
|
||||
FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false);
|
||||
FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false);
|
||||
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
|
||||
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
|
||||
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
|
||||
@ -106,7 +111,7 @@ public void testFailoverFromStandbyToStandby() throws Exception {
|
||||
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
|
||||
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
|
||||
}
|
||||
@ -118,7 +123,7 @@ public void testFailoverFromActiveToActive() throws Exception {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Can't failover to an already active service");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -128,6 +133,33 @@ public void testFailoverFromActiveToActive() throws Exception {
|
||||
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailoverToUnreadyService() throws Exception {
|
||||
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
|
||||
DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
|
||||
@Override
|
||||
public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Can't failover to a service that's not ready");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
}
|
||||
|
||||
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
|
||||
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
|
||||
|
||||
// Forcing it means we ignore readyToBecomeActive
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, true);
|
||||
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
|
||||
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception {
|
||||
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
|
||||
@ -140,7 +172,7 @@ public void monitorHealth() throws HealthCheckFailedException {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failover to unhealthy service");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -162,7 +194,7 @@ public void transitionToStandby() throws ServiceFailedException {
|
||||
|
||||
AlwaysSucceedFencer.fenceCalled = 0;
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
} catch (FailoverFailedException ffe) {
|
||||
fail("Faulty active prevented failover");
|
||||
}
|
||||
@ -187,7 +219,7 @@ public void transitionToStandby() throws ServiceFailedException {
|
||||
|
||||
AlwaysFailFencer.fenceCalled = 0;
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failed over even though fencing failed");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -207,7 +239,7 @@ public void testFencingFailureDuringFailover() throws Exception {
|
||||
|
||||
AlwaysFailFencer.fenceCalled = 0;
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
|
||||
fail("Failed over even though fencing requested and failed");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -238,7 +270,7 @@ public void testFailoverFromNonExistantServiceWithFencer() throws Exception {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
} catch (FailoverFailedException ffe) {
|
||||
fail("Non-existant active prevented failover");
|
||||
}
|
||||
@ -254,7 +286,7 @@ public void testFailoverToNonExistantServiceFails() throws Exception {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failed over to a non-existant standby");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -275,7 +307,7 @@ public void transitionToActive() throws ServiceFailedException {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failover to already active service");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -300,7 +332,7 @@ public void transitionToActive() throws ServiceFailedException {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
|
||||
fail("Failed over to service that won't transition to active");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -325,7 +357,7 @@ public void transitionToActive() throws ServiceFailedException, IOException {
|
||||
AlwaysSucceedFencer.fenceCalled = 0;
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failed over to service that won't transition to active");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -352,7 +384,7 @@ public void transitionToActive() throws ServiceFailedException, IOException {
|
||||
AlwaysFailFencer.fenceCalled = 0;
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failed over to service that won't transition to active");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
@ -383,7 +415,7 @@ public void transitionToActive() throws ServiceFailedException {
|
||||
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
|
||||
|
||||
try {
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
|
||||
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
|
||||
fail("Failover to already active service");
|
||||
} catch (FailoverFailedException ffe) {
|
||||
// Expected
|
||||
|
@ -31,6 +31,7 @@
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Joiner;
|
||||
@ -44,8 +45,9 @@ public class TestHAAdmin {
|
||||
private HAServiceProtocol mockProtocol;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
public void setup() throws IOException {
|
||||
mockProtocol = Mockito.mock(HAServiceProtocol.class);
|
||||
when(mockProtocol.readyToBecomeActive()).thenReturn(true);
|
||||
tool = new HAAdmin() {
|
||||
@Override
|
||||
protected HAServiceProtocol getProtocol(String target) throws IOException {
|
||||
@ -130,6 +132,15 @@ public void testFailoverWithFencerConfiguredAndForce() throws Exception {
|
||||
assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forcefence"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailoverWithForceActive() throws Exception {
|
||||
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
|
||||
Configuration conf = new Configuration();
|
||||
conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
|
||||
tool.setConf(conf);
|
||||
assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forceactive"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailoverWithInvalidFenceArg() throws Exception {
|
||||
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
|
||||
|
@ -929,6 +929,13 @@ synchronized HAServiceState getServiceState() {
|
||||
return state.getServiceState();
|
||||
}
|
||||
|
||||
synchronized boolean readyToBecomeActive() throws ServiceFailedException {
|
||||
if (!haEnabled) {
|
||||
throw new ServiceFailedException("HA for namenode is not enabled");
|
||||
}
|
||||
return !isInSafeMode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Class used as expose {@link NameNode} as context to {@link HAState}
|
||||
|
@ -1007,6 +1007,11 @@ public synchronized HAServiceState getServiceState() {
|
||||
return nn.getServiceState();
|
||||
}
|
||||
|
||||
@Override // HAServiceProtocol
|
||||
public synchronized boolean readyToBecomeActive() throws ServiceFailedException {
|
||||
return nn.readyToBecomeActive();
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify version.
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user