HADOOP-7896. HA: if both NNs are in Standby mode, client needs to try failing back and forth several times with sleeps. Contributed by Aaron T. Myers

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1214076 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2011-12-14 07:24:36 +00:00
parent a0fe4f476a
commit 9818091a66
7 changed files with 219 additions and 40 deletions

View File

@ -5,4 +5,8 @@ branch is merged.
------------------------------
HADOOP-7455. HA: Introduce HA Service Protocol Interface. (suresh)
HADOOP-7774. HA: Administrative CLI to control HA daemons. (todd)
HADOOP-7896. HA: if both NNs are in Standby mode, client needs to try failing
back and forth several times with sleeps. (atm)

View File

@ -24,11 +24,11 @@
import java.lang.reflect.Method;
import java.util.Collections;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.retry.RetryPolicy.RetryAction;
import org.apache.hadoop.util.ThreadUtil;
class RetryInvocationHandler implements InvocationHandler, Closeable {
public static final Log LOG = LogFactory.getLog(RetryInvocationHandler.class);
@ -85,14 +85,20 @@ public Object invoke(Object proxy, Method method, Object[] args)
.isAnnotationPresent(Idempotent.class);
RetryAction action = policy.shouldRetry(e, retries++, invocationFailoverCount,
isMethodIdempotent);
if (action == RetryAction.FAIL) {
if (action.action == RetryAction.RetryDecision.FAIL) {
LOG.warn("Exception while invoking " + method.getName()
+ " of " + currentProxy.getClass() + ". Not retrying.", e);
if (!method.getReturnType().equals(Void.TYPE)) {
throw e; // non-void methods can't fail without an exception
}
return null;
} else if (action == RetryAction.FAILOVER_AND_RETRY) {
} else { // retry or failover
if (action.delayMillis > 0) {
ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
}
if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY) {
LOG.warn("Exception while invoking " + method.getName()
+ " of " + currentProxy.getClass()
+ " after " + invocationFailoverCount + " fail over attempts."
@ -111,6 +117,7 @@ public Object invoke(Object proxy, Method method, Object[] args)
}
invocationFailoverCount++;
}
}
if(LOG.isDebugEnabled()) {
LOG.debug("Exception while invoking " + method.getName()
+ " of " + currentProxy.getClass() + ". Retrying.", e);

View File

@ -33,6 +33,8 @@
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.StandbyException;
import com.google.common.annotations.VisibleForTesting;
/**
* <p>
* A collection of useful implementations of {@link RetryPolicy}.
@ -42,6 +44,8 @@ public class RetryPolicies {
public static final Log LOG = LogFactory.getLog(RetryPolicies.class);
private static final Random RAND = new Random();
/**
* <p>
* Try once, and fail by re-throwing the exception.
@ -137,7 +141,14 @@ public static final RetryPolicy failoverOnNetworkException(int maxFailovers) {
public static final RetryPolicy failoverOnNetworkException(
RetryPolicy fallbackPolicy, int maxFailovers) {
return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers);
return failoverOnNetworkException(fallbackPolicy, maxFailovers, 0, 0);
}
public static final RetryPolicy failoverOnNetworkException(
RetryPolicy fallbackPolicy, int maxFailovers, long delayMillis,
long maxDelayBase) {
return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers,
delayMillis, maxDelayBase);
}
static class TryOnceThenFail implements RetryPolicy {
@ -176,12 +187,8 @@ public RetryAction shouldRetry(Exception e, int retries, int failovers,
if (retries >= maxRetries) {
throw e;
}
try {
timeUnit.sleep(calculateSleepTime(retries));
} catch (InterruptedException ie) {
// retry
}
return RetryAction.RETRY;
return new RetryAction(RetryAction.RetryDecision.RETRY,
timeUnit.toMillis(calculateSleepTime(retries)));
}
protected abstract long calculateSleepTime(int retries);
@ -268,7 +275,7 @@ public RetryAction shouldRetry(Exception e, int retries, int failovers,
}
static class ExponentialBackoffRetry extends RetryLimited {
private Random r = new Random();
public ExponentialBackoffRetry(
int maxRetries, long sleepTime, TimeUnit timeUnit) {
super(maxRetries, sleepTime, timeUnit);
@ -276,16 +283,19 @@ public ExponentialBackoffRetry(
@Override
protected long calculateSleepTime(int retries) {
return sleepTime*r.nextInt(1<<(retries+1));
return calculateExponentialTime(sleepTime, retries + 1);
}
}
/*
/**
* Fail over and retry in the case of:
* Remote StandbyException (server is up, but is not the active server)
* Immediate socket exceptions (e.g. no route to host, econnrefused)
* Socket exceptions after initial connection when operation is idempotent
*
* The first failover is immediate, while all subsequent failovers wait an
* exponentially-increasing random amount of time.
*
* Fail immediately in the case of:
* Socket exceptions after initial connection when operation is not idempotent
*
@ -295,11 +305,20 @@ static class FailoverOnNetworkExceptionRetry implements RetryPolicy {
private RetryPolicy fallbackPolicy;
private int maxFailovers;
private long delayMillis;
private long maxDelayBase;
public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
int maxFailovers) {
this(fallbackPolicy, maxFailovers, 0, 0);
}
public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
int maxFailovers, long delayMillis, long maxDelayBase) {
this.fallbackPolicy = fallbackPolicy;
this.maxFailovers = maxFailovers;
this.delayMillis = delayMillis;
this.maxDelayBase = maxDelayBase;
}
@Override
@ -314,8 +333,13 @@ public RetryAction shouldRetry(Exception e, int retries,
if (e instanceof ConnectException ||
e instanceof NoRouteToHostException ||
e instanceof UnknownHostException ||
e instanceof StandbyException) {
return RetryAction.FAILOVER_AND_RETRY;
e instanceof StandbyException ||
isWrappedStandbyException(e)) {
return new RetryAction(
RetryAction.RetryDecision.FAILOVER_AND_RETRY,
// retry immediately if this is our first failover, sleep otherwise
failovers == 0 ? 0 :
calculateExponentialTime(delayMillis, failovers, maxDelayBase));
} else if (e instanceof SocketException ||
e instanceof IOException) {
if (isMethodIdempotent) {
@ -330,4 +354,34 @@ public RetryAction shouldRetry(Exception e, int retries,
}
}
/**
* Return a value which is <code>time</code> increasing exponentially as a
* function of <code>retries</code>, +/- 0%-50% of that value, chosen
* randomly.
*
* @param time the base amount of time to work with
* @param retries the number of retries that have so occurred so far
* @param cap value at which to cap the base sleep time
* @return an amount of time to sleep
*/
@VisibleForTesting
public static long calculateExponentialTime(long time, int retries,
long cap) {
long baseTime = Math.min(time * ((long)1 << retries), cap);
return (long) (baseTime * (RAND.nextFloat() + 0.5));
}
private static long calculateExponentialTime(long time, int retries) {
return calculateExponentialTime(time, retries, Long.MAX_VALUE);
}
private static boolean isWrappedStandbyException(Exception e) {
if (!(e instanceof RemoteException)) {
return false;
}
Exception unwrapped = ((RemoteException)e).unwrapRemoteException(
StandbyException.class);
return unwrapped instanceof StandbyException;
}
}

View File

@ -19,7 +19,6 @@
import org.apache.hadoop.classification.InterfaceStability;
/**
* <p>
* Specifies a policy for retrying method failures.
@ -33,11 +32,34 @@ public interface RetryPolicy {
* Returned by {@link RetryPolicy#shouldRetry(Exception, int, int, boolean)}.
*/
@InterfaceStability.Evolving
public enum RetryAction {
public static class RetryAction {
// A few common retry policies, with no delays.
public static final RetryAction FAIL =
new RetryAction(RetryDecision.FAIL);
public static final RetryAction RETRY =
new RetryAction(RetryDecision.RETRY);
public static final RetryAction FAILOVER_AND_RETRY =
new RetryAction(RetryDecision.FAILOVER_AND_RETRY);
public final RetryDecision action;
public final long delayMillis;
public RetryAction(RetryDecision action) {
this(action, 0);
}
public RetryAction(RetryDecision action, long delayTime) {
this.action = action;
this.delayMillis = delayTime;
}
public enum RetryDecision {
FAIL,
RETRY,
FAILOVER_AND_RETRY
}
}
/**
* <p>

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceStability;
@InterfaceStability.Evolving
public class ThreadUtil {
private static final Log LOG = LogFactory.getLog(ThreadUtil.class);
/**
* Cause the current thread to sleep as close as possible to the provided
* number of milliseconds. This method will log and ignore any
* {@link InterrupedException} encountered.
*
* @param millis the number of milliseconds for the current thread to sleep
*/
public static void sleepAtLeastIgnoreInterrupts(long millis) {
long start = System.currentTimeMillis();
while (System.currentTimeMillis() - start < millis) {
long timeToSleep = millis -
(System.currentTimeMillis() - start);
try {
Thread.sleep(timeToSleep);
} catch (InterruptedException ie) {
LOG.warn("interrupted while sleeping", ie);
}
}
}
}

View File

@ -25,6 +25,7 @@
import org.apache.hadoop.io.retry.UnreliableImplementation.TypeOfExceptionToFailWith;
import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException;
import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.util.ThreadUtil;
import org.junit.Test;
public class TestFailoverProxy {
@ -267,4 +268,40 @@ public void testConcurrentMethodFailures() throws InterruptedException {
assertEquals("impl2", t2.result);
assertEquals(1, proxyProvider.getFailoversOccurred());
}
/**
* Ensure that when all configured services are throwing StandbyException
* that we fail over back and forth between them until one is no longer
* throwing StandbyException.
*/
@Test
public void testFailoverBetweenMultipleStandbys()
throws UnreliableException, StandbyException, IOException {
final long millisToSleep = 10000;
final UnreliableImplementation impl1 = new UnreliableImplementation("impl1",
TypeOfExceptionToFailWith.STANDBY_EXCEPTION);
FlipFlopProxyProvider proxyProvider = new FlipFlopProxyProvider(
UnreliableInterface.class,
impl1,
new UnreliableImplementation("impl2",
TypeOfExceptionToFailWith.STANDBY_EXCEPTION));
final UnreliableInterface unreliable = (UnreliableInterface)RetryProxy
.create(UnreliableInterface.class, proxyProvider,
RetryPolicies.failoverOnNetworkException(
RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000));
new Thread() {
@Override
public void run() {
ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep);
impl1.setIdentifier("renamed-impl1");
}
}.start();
String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1");
assertEquals("renamed-impl1", result);
}
}

View File

@ -48,6 +48,10 @@ public UnreliableImplementation(String identifier) {
this(identifier, TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION);
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public UnreliableImplementation(String identifier,
TypeOfExceptionToFailWith exceptionToFailWith) {
this.identifier = identifier;
@ -147,15 +151,17 @@ public String failsIfIdentifierDoesntMatch(String identifier)
if (this.identifier.equals(identifier)) {
return identifier;
} else {
String message = "expected '" + this.identifier + "' but received '" +
identifier + "'";
switch (exceptionToFailWith) {
case STANDBY_EXCEPTION:
throw new StandbyException(identifier);
throw new StandbyException(message);
case UNRELIABLE_EXCEPTION:
throw new UnreliableException(identifier);
throw new UnreliableException(message);
case IO_EXCEPTION:
throw new IOException(identifier);
throw new IOException(message);
default:
throw new RuntimeException(identifier);
throw new RuntimeException(message);
}
}
}