HADOOP-7924. 
FailoverController for client-based configuration. Contributed by Eli Collins

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1227836 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Eli Collins 2012-01-05 21:02:59 +00:00
parent 3187a3ae05
commit fb9cdcfa60
9 changed files with 457 additions and 38 deletions

View File

@ -24,3 +24,5 @@ HADOOP-7925. Add interface and update CLI to query current state to
HADOOP-7932. Make client connection retries on socket time outs configurable. HADOOP-7932. Make client connection retries on socket time outs configurable.
(Uma Maheswara Rao G via todd) (Uma Maheswara Rao G via todd)
HADOOP-7924. FailoverController for client-based configuration (eli)

View File

@ -36,6 +36,7 @@ function print_usage(){
echo " classpath prints the class path needed to get the" echo " classpath prints the class path needed to get the"
echo " Hadoop jar and the required libraries" echo " Hadoop jar and the required libraries"
echo " daemonlog get/set the log level for each daemon" echo " daemonlog get/set the log level for each daemon"
echo " haadmin run a HA admin client"
echo " or" echo " or"
echo " CLASSNAME run the class named CLASSNAME" echo " CLASSNAME run the class named CLASSNAME"
echo "" echo ""
@ -95,6 +96,10 @@ case $COMMAND in
CLASS=org.apache.hadoop.tools.HadoopArchives CLASS=org.apache.hadoop.tools.HadoopArchives
CLASSPATH=${CLASSPATH}:${TOOL_PATH} CLASSPATH=${CLASSPATH}:${TOOL_PATH}
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "haadmin" ] ; then
CLASS=org.apache.hadoop.ha.HAAdmin
CLASSPATH=${CLASSPATH}:${TOOL_PATH}
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [[ "$COMMAND" = -* ]] ; then elif [[ "$COMMAND" = -* ]] ; then
# class and package names cannot begin with a - # class and package names cannot begin with a -
echo "Error: No command named \`$COMMAND' was found. Perhaps you meant \`hadoop ${COMMAND#-}'" echo "Error: No command named \`$COMMAND' was found. Perhaps you meant \`hadoop ${COMMAND#-}'"

View File

@ -0,0 +1,132 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
/**
* The FailOverController is responsible for electing an active service
* on startup or when the current active is changing (eg due to failure),
* monitoring the health of a service, and performing a fail-over when a
* new active service is either manually selected by a user or elected.
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FailoverController {
private static final Log LOG = LogFactory.getLog(FailoverController.class);
/**
* Perform pre-failover checks on the given service we plan to
* failover to, eg to prevent failing over to a service (eg due
* to it being inaccessible, already active, not healthy, etc).
*
* @param toSvc service to make active
* @param toSvcName name of service to make active
* @throws FailoverFailedException if we should avoid failover
*/
private static void preFailoverChecks(HAServiceProtocol toSvc,
String toSvcName)
throws FailoverFailedException {
HAServiceState toSvcState;
try {
toSvcState = toSvc.getServiceState();
} catch (Exception e) {
String msg = "Unable to get service state for " + toSvcName;
LOG.error(msg, e);
throw new FailoverFailedException(msg, e);
}
if (!toSvcState.equals(HAServiceState.STANDBY)) {
throw new FailoverFailedException(
"Can't failover to an active service");
}
try {
toSvc.monitorHealth();
} catch (HealthCheckFailedException hce) {
throw new FailoverFailedException(
"Can't failover to an unhealthy service", hce);
}
// TODO(HA): ask toSvc if it's capable. Eg not in SM.
}
/**
* Failover from service 1 to service 2. If the failover fails
* then try to failback.
*
* @param fromSvc currently active service
* @param fromSvcName name of currently active service
* @param toSvc service to make active
* @param toSvcName name of service to make active
* @throws FailoverFailedException if the failover fails
*/
public static void failover(HAServiceProtocol fromSvc, String fromSvcName,
HAServiceProtocol toSvc, String toSvcName)
throws FailoverFailedException {
preFailoverChecks(toSvc, toSvcName);
// Try to make fromSvc standby
try {
fromSvc.transitionToStandby();
} catch (ServiceFailedException sfe) {
LOG.warn("Unable to make " + fromSvcName + " standby (" +
sfe.getMessage() + ")");
} catch (Exception e) {
LOG.warn("Unable to make " + fromSvcName +
" standby (unable to connect)", e);
// TODO(HA): fence fromSvc and unfence on failback
}
// Try to make toSvc active
boolean failed = false;
Throwable cause = null;
try {
toSvc.transitionToActive();
} catch (ServiceFailedException sfe) {
LOG.error("Unable to make " + toSvcName + " active (" +
sfe.getMessage() + "). Failing back");
failed = true;
cause = sfe;
} catch (Exception e) {
LOG.error("Unable to make " + toSvcName +
" active (unable to connect). Failing back", e);
failed = true;
cause = e;
}
// Try to failback if we failed to make toSvc active
if (failed) {
String msg = "Unable to failover to " + toSvcName;
try {
fromSvc.transitionToActive();
} catch (ServiceFailedException sfe) {
msg = "Failback to " + fromSvcName + " failed (" +
sfe.getMessage() + ")";
LOG.fatal(msg);
} catch (Exception e) {
msg = "Failback to " + fromSvcName + " failed (unable to connect)";
LOG.fatal(msg);
}
throw new FailoverFailedException(msg, cause);
}
}
}

View File

@ -0,0 +1,38 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
/**
* Exception thrown to indicate service failover has failed.
*/
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class FailoverFailedException extends Exception {
private static final long serialVersionUID = 1L;
public FailoverFailedException(final String message) {
super(message);
}
public FailoverFailedException(String message, Throwable cause) {
super(message, cause);
}
}

View File

@ -24,7 +24,6 @@
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configured; import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.Tool;
@ -46,6 +45,9 @@ public class HAAdmin extends Configured implements Tool {
new UsageInfo("<host:port>", "Transitions the daemon into Active state")) new UsageInfo("<host:port>", "Transitions the daemon into Active state"))
.put("-transitionToStandby", .put("-transitionToStandby",
new UsageInfo("<host:port>", "Transitions the daemon into Standby state")) new UsageInfo("<host:port>", "Transitions the daemon into Standby state"))
.put("-failover",
new UsageInfo("<host:port> <host:port>",
"Failover from the first daemon to the second"))
.put("-getServiceState", .put("-getServiceState",
new UsageInfo("<host:port>", "Returns the state of the daemon")) new UsageInfo("<host:port>", "Returns the state of the daemon"))
.put("-checkHealth", .put("-checkHealth",
@ -94,7 +96,6 @@ private int transitionToActive(final String[] argv)
return 0; return 0;
} }
private int transitionToStandby(final String[] argv) private int transitionToStandby(final String[] argv)
throws IOException, ServiceFailedException { throws IOException, ServiceFailedException {
if (argv.length != 2) { if (argv.length != 2) {
@ -107,7 +108,27 @@ private int transitionToStandby(final String[] argv)
proto.transitionToStandby(); proto.transitionToStandby();
return 0; return 0;
} }
private int failover(final String[] argv)
throws IOException, ServiceFailedException {
if (argv.length != 3) {
errOut.println("failover: incorrect number of arguments");
printUsage(errOut, "-failover");
return -1;
}
HAServiceProtocol proto1 = getProtocol(argv[1]);
HAServiceProtocol proto2 = getProtocol(argv[2]);
try {
FailoverController.failover(proto1, argv[1], proto2, argv[2]);
out.println("Failover from "+argv[1]+" to "+argv[2]+" successful");
} catch (FailoverFailedException ffe) {
errOut.println("Failover failed: " + ffe.getLocalizedMessage());
return 1;
}
return 0;
}
private int checkHealth(final String[] argv) private int checkHealth(final String[] argv)
throws IOException, ServiceFailedException { throws IOException, ServiceFailedException {
if (argv.length != 2) { if (argv.length != 2) {
@ -171,6 +192,8 @@ public int run(String[] argv) throws Exception {
return transitionToActive(argv); return transitionToActive(argv);
} else if ("-transitionToStandby".equals(cmd)) { } else if ("-transitionToStandby".equals(cmd)) {
return transitionToStandby(argv); return transitionToStandby(argv);
} else if ("-failover".equals(cmd)) {
return failover(argv);
} else if ("-getServiceState".equals(cmd)) { } else if ("-getServiceState".equals(cmd)) {
return getServiceState(argv); return getServiceState(argv);
} else if ("-checkHealth".equals(cmd)) { } else if ("-checkHealth".equals(cmd)) {

View File

@ -21,34 +21,17 @@
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
/** /**
* Exception thrown to indicate that health check of a service * Exception thrown to indicate that health check of a service failed.
* failed.
*/ */
@InterfaceAudience.Public @InterfaceAudience.Public
@InterfaceStability.Evolving @InterfaceStability.Evolving
public class HealthCheckFailedException extends Exception { public class HealthCheckFailedException extends Exception {
private static final long serialVersionUID = 1L; private static final long serialVersionUID = 1L;
/**
* Constructs exception with the specified detail message.
* @param message the detail message (which is saved for later retrieval
* by the {@link #getMessage()} method).
*/
public HealthCheckFailedException(final String message) { public HealthCheckFailedException(final String message) {
super(message); super(message);
} }
/**
* Constructs a new exception with the specified detail message and
* cause.
*
* @param message the detail message (which is saved for later retrieval
* by the {@link #getMessage()} method).
* @param cause the cause (which is saved for later retrieval by the
* {@link #getCause()} method). (A <tt>null</tt> value is
* permitted, and indicates that the cause is nonexistent or
* unknown.)
*/
public HealthCheckFailedException(String message, Throwable cause) { public HealthCheckFailedException(String message, Throwable cause) {
super(message, cause); super(message, cause);
} }

View File

@ -30,27 +30,11 @@
public class ServiceFailedException extends Exception { public class ServiceFailedException extends Exception {
private static final long serialVersionUID = 1L; private static final long serialVersionUID = 1L;
/**
* Constructs exception with the specified detail message.
* @param message the detail message (which is saved for later retrieval
* by the {@link #getMessage()} method).
*/
public ServiceFailedException(final String message) { public ServiceFailedException(final String message) {
super(message); super(message);
} }
/**
* Constructs a new exception with the specified detail message and
* cause.
*
* @param message the detail message (which is saved for later retrieval
* by the {@link #getMessage()} method).
* @param cause the cause (which is saved for later retrieval by the
* {@link #getCause()} method). (A <tt>null</tt> value is
* permitted, and indicates that the cause is nonexistent or
* unknown.)
*/
public ServiceFailedException(String message, Throwable cause) { public ServiceFailedException(String message, Throwable cause) {
super(message, cause); super(message, cause);
} }
} }

View File

@ -0,0 +1,247 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import java.io.IOException;
import java.net.InetSocketAddress;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
import org.junit.Test;
import static org.junit.Assert.*;
public class TestFailoverController {
private class DummyService implements HAServiceProtocol {
HAServiceState state;
DummyService(HAServiceState state) {
this.state = state;
}
@Override
public long getProtocolVersion(String protocol, long clientVersion)
throws IOException {
return 0;
}
@Override
public ProtocolSignature getProtocolSignature(String protocol,
long clientVersion, int clientMethodsHash) throws IOException {
return null;
}
@Override
public void monitorHealth() throws HealthCheckFailedException {
// Do nothing
}
@Override
public void transitionToActive() throws ServiceFailedException {
state = HAServiceState.ACTIVE;
}
@Override
public void transitionToStandby() throws ServiceFailedException {
state = HAServiceState.STANDBY;
}
@Override
public HAServiceState getServiceState() {
return state;
}
}
@Test
public void testFailoverAndFailback() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
FailoverController.failover(svc1, "svc1", svc2, "svc2");
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
FailoverController.failover(svc2, "svc2", svc1, "svc1");
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
}
@Test
public void testFailoverFromStandbyToStandby() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.STANDBY);
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
FailoverController.failover(svc1, "svc1", svc2, "svc2");
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@Test
public void testFailoverFromActiveToActive() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
DummyService svc2 = new DummyService(HAServiceState.ACTIVE);
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
fail("Can't failover to an already active service");
} catch (FailoverFailedException ffe) {
// Expected
}
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@Test
public void testFailoverToUnhealthyServiceFails() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
@Override
public void monitorHealth() throws HealthCheckFailedException {
throw new HealthCheckFailedException("Failed!");
}
};
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
fail("Failover to unhealthy service");
} catch (FailoverFailedException ffe) {
// Expected
}
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
}
@Test
public void testFailoverFromFaultyServiceSucceeds() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
@Override
public void transitionToStandby() throws ServiceFailedException {
throw new ServiceFailedException("Failed!");
}
};
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
} catch (FailoverFailedException ffe) {
fail("Faulty active prevented failover");
}
// svc1 still thinks they're active, that's OK, we'll fence them
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
private HAServiceProtocol getProtocol(String target)
throws IOException {
InetSocketAddress addr = NetUtils.createSocketAddr(target);
Configuration conf = new Configuration();
// Lower the timeout so we quickly fail to connect
conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 1);
return (HAServiceProtocol)RPC.getProxy(
HAServiceProtocol.class, HAServiceProtocol.versionID, addr, conf);
}
@Test
public void testFailoverFromNonExistantServiceSucceeds() throws Exception {
HAServiceProtocol svc1 = getProtocol("localhost:1234");
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
} catch (FailoverFailedException ffe) {
fail("Non-existant active prevented failover");
}
// Don't check svc1 (we can't reach it, but that's OK, we'll fence)
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@Test
public void testFailoverToNonExistantServiceFails() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
HAServiceProtocol svc2 = getProtocol("localhost:1234");
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
fail("Failed over to a non-existant standby");
} catch (FailoverFailedException ffe) {
// Expected
}
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
}
@Test
public void testFailoverToFaultyServiceFailsbackOK() throws Exception {
DummyService svc1 = spy(new DummyService(HAServiceState.ACTIVE));
DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
@Override
public void transitionToActive() throws ServiceFailedException {
throw new ServiceFailedException("Failed!");
}
};
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected
}
// svc1 went standby then back to active
verify(svc1).transitionToStandby();
verify(svc1).transitionToActive();
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
}
@Test
public void testFailbackToFaultyServiceFails() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
@Override
public void transitionToActive() throws ServiceFailedException {
throw new ServiceFailedException("Failed!");
}
};
DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
@Override
public void transitionToActive() throws ServiceFailedException {
throw new ServiceFailedException("Failed!");
}
};
try {
FailoverController.failover(svc1, "svc1", svc2, "svc2");
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected
}
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
}
}

View File

@ -100,6 +100,11 @@ public void testTransitionToStandby() throws Exception {
Mockito.verify(mockProtocol).transitionToStandby(); Mockito.verify(mockProtocol).transitionToStandby();
} }
@Test
public void testFailover() throws Exception {
assertEquals(0, runTool("-failover", "xxx", "yyy"));
}
@Test @Test
public void testGetServiceState() throws Exception { public void testGetServiceState() throws Exception {
assertEquals(0, runTool("-getServiceState", "xxx")); assertEquals(0, runTool("-getServiceState", "xxx"));