diff --git a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt index 3207e70c38..216b562210 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt @@ -5,4 +5,8 @@ branch is merged. ------------------------------ HADOOP-7455. HA: Introduce HA Service Protocol Interface. (suresh) + HADOOP-7774. HA: Administrative CLI to control HA daemons. (todd) + +HADOOP-7896. HA: if both NNs are in Standby mode, client needs to try failing + back and forth several times with sleeps. (atm) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java index f928760253..d165577825 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java @@ -24,11 +24,11 @@ import java.lang.reflect.Method; import java.util.Collections; import java.util.Map; -import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.retry.RetryPolicy.RetryAction; +import org.apache.hadoop.util.ThreadUtil; class RetryInvocationHandler implements InvocationHandler, Closeable { public static final Log LOG = LogFactory.getLog(RetryInvocationHandler.class); @@ -85,31 +85,38 @@ public Object invoke(Object proxy, Method method, Object[] args) .isAnnotationPresent(Idempotent.class); RetryAction action = policy.shouldRetry(e, retries++, invocationFailoverCount, isMethodIdempotent); - if (action == RetryAction.FAIL) { + if (action.action == RetryAction.RetryDecision.FAIL) { LOG.warn("Exception while invoking " + method.getName() + " of " + 
currentProxy.getClass() + ". Not retrying.", e); if (!method.getReturnType().equals(Void.TYPE)) { throw e; // non-void methods can't fail without an exception } return null; - } else if (action == RetryAction.FAILOVER_AND_RETRY) { - LOG.warn("Exception while invoking " + method.getName() - + " of " + currentProxy.getClass() - + " after " + invocationFailoverCount + " fail over attempts." - + " Trying to fail over.", e); - // Make sure that concurrent failed method invocations only cause a - // single actual fail over. - synchronized (proxyProvider) { - if (invocationAttemptFailoverCount == proxyProviderFailoverCount) { - proxyProvider.performFailover(currentProxy); - proxyProviderFailoverCount++; - currentProxy = proxyProvider.getProxy(); - } else { - LOG.warn("A failover has occurred since the start of this method" - + " invocation attempt."); - } + } else { // retry or failover + + if (action.delayMillis > 0) { + ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis); + } + + if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY) { + LOG.warn("Exception while invoking " + method.getName() + + " of " + currentProxy.getClass() + + " after " + invocationFailoverCount + " fail over attempts." + + " Trying to fail over.", e); + // Make sure that concurrent failed method invocations only cause a + // single actual fail over. 
+ synchronized (proxyProvider) { + if (invocationAttemptFailoverCount == proxyProviderFailoverCount) { + proxyProvider.performFailover(currentProxy); + proxyProviderFailoverCount++; + currentProxy = proxyProvider.getProxy(); + } else { + LOG.warn("A failover has occurred since the start of this method" + + " invocation attempt."); + } + } + invocationFailoverCount++; } - invocationFailoverCount++; } if(LOG.isDebugEnabled()) { LOG.debug("Exception while invoking " + method.getName() diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java index 3634e18673..5afda59475 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java @@ -33,6 +33,8 @@ import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ipc.StandbyException; +import com.google.common.annotations.VisibleForTesting; + /** *
* A collection of useful implementations of {@link RetryPolicy}. @@ -42,6 +44,8 @@ public class RetryPolicies { public static final Log LOG = LogFactory.getLog(RetryPolicies.class); + private static final Random RAND = new Random(); + /** *
* Try once, and fail by re-throwing the exception.
@@ -137,7 +141,14 @@ public static final RetryPolicy failoverOnNetworkException(int maxFailovers) {
public static final RetryPolicy failoverOnNetworkException(
RetryPolicy fallbackPolicy, int maxFailovers) {
- return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers);
+ return failoverOnNetworkException(fallbackPolicy, maxFailovers, 0, 0);
+ }
+
+ public static final RetryPolicy failoverOnNetworkException(
+ RetryPolicy fallbackPolicy, int maxFailovers, long delayMillis,
+ long maxDelayBase) {
+ return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers,
+ delayMillis, maxDelayBase);
}
static class TryOnceThenFail implements RetryPolicy {
@@ -176,12 +187,8 @@ public RetryAction shouldRetry(Exception e, int retries, int failovers,
if (retries >= maxRetries) {
throw e;
}
- try {
- timeUnit.sleep(calculateSleepTime(retries));
- } catch (InterruptedException ie) {
- // retry
- }
- return RetryAction.RETRY;
+ return new RetryAction(RetryAction.RetryDecision.RETRY,
+ timeUnit.toMillis(calculateSleepTime(retries)));
}
protected abstract long calculateSleepTime(int retries);
@@ -268,7 +275,7 @@ public RetryAction shouldRetry(Exception e, int retries, int failovers,
}
static class ExponentialBackoffRetry extends RetryLimited {
- private Random r = new Random();
+
public ExponentialBackoffRetry(
int maxRetries, long sleepTime, TimeUnit timeUnit) {
super(maxRetries, sleepTime, timeUnit);
@@ -276,16 +283,19 @@ public ExponentialBackoffRetry(
@Override
protected long calculateSleepTime(int retries) {
- return sleepTime*r.nextInt(1<<(retries+1));
+ return calculateExponentialTime(sleepTime, retries + 1);
}
}
- /*
+ /**
* Fail over and retry in the case of:
* Remote StandbyException (server is up, but is not the active server)
* Immediate socket exceptions (e.g. no route to host, econnrefused)
* Socket exceptions after initial connection when operation is idempotent
*
+ * The first failover is immediate, while all subsequent failovers wait an
+ * exponentially-increasing random amount of time.
+ *
* Fail immediately in the case of:
* Socket exceptions after initial connection when operation is not idempotent
*
@@ -295,11 +305,20 @@ static class FailoverOnNetworkExceptionRetry implements RetryPolicy {
private RetryPolicy fallbackPolicy;
private int maxFailovers;
+ private long delayMillis;
+ private long maxDelayBase;
public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
int maxFailovers) {
+ this(fallbackPolicy, maxFailovers, 0, 0);
+ }
+
+ public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
+ int maxFailovers, long delayMillis, long maxDelayBase) {
this.fallbackPolicy = fallbackPolicy;
this.maxFailovers = maxFailovers;
+ this.delayMillis = delayMillis;
+ this.maxDelayBase = maxDelayBase;
}
@Override
@@ -314,8 +333,13 @@ public RetryAction shouldRetry(Exception e, int retries,
if (e instanceof ConnectException ||
e instanceof NoRouteToHostException ||
e instanceof UnknownHostException ||
- e instanceof StandbyException) {
- return RetryAction.FAILOVER_AND_RETRY;
+ e instanceof StandbyException ||
+ isWrappedStandbyException(e)) {
+ return new RetryAction(
+ RetryAction.RetryDecision.FAILOVER_AND_RETRY,
+ // retry immediately if this is our first failover, sleep otherwise
+ failovers == 0 ? 0 :
+ calculateExponentialTime(delayMillis, failovers, maxDelayBase));
} else if (e instanceof SocketException ||
e instanceof IOException) {
if (isMethodIdempotent) {
@@ -330,4 +354,34 @@ public RetryAction shouldRetry(Exception e, int retries,
}
}
+
+ /**
+ * Return a value which is <code>time</code> increasing exponentially as a
+ * function of <code>retries</code>, +/- 0%-50% of that value, chosen
+ * randomly.
+ *
+ * @param time the base amount of time to work with
+ * @param retries the number of retries that have occurred so far
+ * @param cap value at which to cap the base sleep time
+ * @return an amount of time to sleep
+ */
+ @VisibleForTesting
+ public static long calculateExponentialTime(long time, int retries,
+ long cap) {
+ long baseTime = Math.min(time * ((long)1 << retries), cap);
+ return (long) (baseTime * (RAND.nextFloat() + 0.5));
+ }
+
+ private static long calculateExponentialTime(long time, int retries) {
+ return calculateExponentialTime(time, retries, Long.MAX_VALUE);
+ }
+
+ private static boolean isWrappedStandbyException(Exception e) {
+ if (!(e instanceof RemoteException)) {
+ return false;
+ }
+ Exception unwrapped = ((RemoteException)e).unwrapRemoteException(
+ StandbyException.class);
+ return unwrapped instanceof StandbyException;
+ }
}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
index 4c4534ffb7..90e5eaea67 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
@@ -19,7 +19,6 @@
import org.apache.hadoop.classification.InterfaceStability;
-
/**
*
* Specifies a policy for retrying method failures. @@ -33,10 +32,33 @@ public interface RetryPolicy { * Returned by {@link RetryPolicy#shouldRetry(Exception, int, int, boolean)}. */ @InterfaceStability.Evolving - public enum RetryAction { - FAIL, - RETRY, - FAILOVER_AND_RETRY + public static class RetryAction { + + // A few common retry policies, with no delays. + public static final RetryAction FAIL = + new RetryAction(RetryDecision.FAIL); + public static final RetryAction RETRY = + new RetryAction(RetryDecision.RETRY); + public static final RetryAction FAILOVER_AND_RETRY = + new RetryAction(RetryDecision.FAILOVER_AND_RETRY); + + public final RetryDecision action; + public final long delayMillis; + + public RetryAction(RetryDecision action) { + this(action, 0); + } + + public RetryAction(RetryDecision action, long delayTime) { + this.action = action; + this.delayMillis = delayTime; + } + + public enum RetryDecision { + FAIL, + RETRY, + FAILOVER_AND_RETRY + } } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java new file mode 100644 index 0000000000..535ac34122 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.classification.InterfaceStability; + +@InterfaceStability.Evolving +public class ThreadUtil { + + private static final Log LOG = LogFactory.getLog(ThreadUtil.class); + + /** + * Cause the current thread to sleep as close as possible to the provided + * number of milliseconds. This method will log and ignore any + * {@link InterrupedException} encountered. + * + * @param millis the number of milliseconds for the current thread to sleep + */ + public static void sleepAtLeastIgnoreInterrupts(long millis) { + long start = System.currentTimeMillis(); + while (System.currentTimeMillis() - start < millis) { + long timeToSleep = millis - + (System.currentTimeMillis() - start); + try { + Thread.sleep(timeToSleep); + } catch (InterruptedException ie) { + LOG.warn("interrupted while sleeping", ie); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java index eec4797ab3..b52814cfc1 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java @@ -25,6 +25,7 @@ import org.apache.hadoop.io.retry.UnreliableImplementation.TypeOfExceptionToFailWith; import 
org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException; import org.apache.hadoop.ipc.StandbyException; +import org.apache.hadoop.util.ThreadUtil; import org.junit.Test; public class TestFailoverProxy { @@ -267,4 +268,40 @@ public void testConcurrentMethodFailures() throws InterruptedException { assertEquals("impl2", t2.result); assertEquals(1, proxyProvider.getFailoversOccurred()); } + + /** + * Ensure that when all configured services are throwing StandbyException + * that we fail over back and forth between them until one is no longer + * throwing StandbyException. + */ + @Test + public void testFailoverBetweenMultipleStandbys() + throws UnreliableException, StandbyException, IOException { + + final long millisToSleep = 10000; + + final UnreliableImplementation impl1 = new UnreliableImplementation("impl1", + TypeOfExceptionToFailWith.STANDBY_EXCEPTION); + FlipFlopProxyProvider proxyProvider = new FlipFlopProxyProvider( + UnreliableInterface.class, + impl1, + new UnreliableImplementation("impl2", + TypeOfExceptionToFailWith.STANDBY_EXCEPTION)); + + final UnreliableInterface unreliable = (UnreliableInterface)RetryProxy + .create(UnreliableInterface.class, proxyProvider, + RetryPolicies.failoverOnNetworkException( + RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000)); + + new Thread() { + @Override + public void run() { + ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep); + impl1.setIdentifier("renamed-impl1"); + } + }.start(); + + String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1"); + assertEquals("renamed-impl1", result); + } } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java index 7fa88b3b08..74a63894d8 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java 
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java @@ -48,6 +48,10 @@ public UnreliableImplementation(String identifier) { this(identifier, TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION); } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + public UnreliableImplementation(String identifier, TypeOfExceptionToFailWith exceptionToFailWith) { this.identifier = identifier; @@ -147,15 +151,17 @@ public String failsIfIdentifierDoesntMatch(String identifier) if (this.identifier.equals(identifier)) { return identifier; } else { + String message = "expected '" + this.identifier + "' but received '" + + identifier + "'"; switch (exceptionToFailWith) { case STANDBY_EXCEPTION: - throw new StandbyException(identifier); + throw new StandbyException(message); case UNRELIABLE_EXCEPTION: - throw new UnreliableException(identifier); + throw new UnreliableException(message); case IO_EXCEPTION: - throw new IOException(identifier); + throw new IOException(message); default: - throw new RuntimeException(identifier); + throw new RuntimeException(message); } } }