diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index c09c7f1ac2..5527ac4a12 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1602,8 +1602,10 @@ public ContainerState transition(final ContainerImpl container, } container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n"); } - if (container.shouldRetry(container.exitCode)) { + // Updates to the retry context should be protected from concurrent + // writes. It should only be called from this transition. + container.retryPolicy.updateRetryContext(container.windowRetryContext); container.storeRetryContext(); doRelaunch(container, container.windowRetryContext.getRemainingRetries(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java index 0208879475..36a8b918c5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java @@ -42,38 +42,58 @@ public SlidingWindowRetryPolicy(Clock clock) { public boolean shouldRetry(RetryContext retryContext, int errorCode) { - ContainerRetryContext containerRC = retryContext - .containerRetryContext; + ContainerRetryContext containerRC = retryContext.containerRetryContext; Preconditions.checkNotNull(containerRC, "container retry context null"); ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy(); if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES && containerRC.getErrorCodes() != null && containerRC.getErrorCodes().contains(errorCode))) { - if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) { - return true; - } - int pendingRetries = calculatePendingRetries(retryContext); - updateRetryContext(retryContext, pendingRetries); - return pendingRetries > 0; + return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER + || calculateRemainingRetries(retryContext) > 0; } return false; } /** - * Calculates the pending number of retries. - *

- * When failuresValidityInterval is > 0, it also removes time entries from - * restartTimes which are outside the validity interval. + * Calculates the remaining number of retries. * - * @return the pending retries. + * @return the remaining retries. */ - private int calculatePendingRetries(RetryContext retryContext) { + private int calculateRemainingRetries(RetryContext retryContext) { ContainerRetryContext containerRC = retryContext.containerRetryContext; if (containerRC.getFailuresValidityInterval() > 0) { + int validFailuresCount = 0; + long currentTime = clock.getTime(); + for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) { + long restartTime = retryContext.restartTimes.get(i); + if (currentTime - restartTime + <= containerRC.getFailuresValidityInterval()) { + validFailuresCount++; + } else { + break; + } + } + return containerRC.getMaxRetries() - validFailuresCount; + } else { + return retryContext.getRemainingRetries(); + } + } + + /** + * Updates remaining retries and the restart time when + * required in the retryContext. + *

+ * When failuresValidityInterval is > 0, it also removes time entries from + * restartTimes which are outside the validity interval. + */ + protected void updateRetryContext(RetryContext retryContext) { + if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) { + ContainerRetryContext containerRC = retryContext.containerRetryContext; Iterator iterator = retryContext.getRestartTimes().iterator(); long currentTime = clock.getTime(); + while (iterator.hasNext()) { long restartTime = iterator.next(); if (currentTime - restartTime @@ -83,23 +103,11 @@ private int calculatePendingRetries(RetryContext retryContext) { break; } } - return containerRC.getMaxRetries() - - retryContext.getRestartTimes().size(); + retryContext.setRemainingRetries(containerRC.getMaxRetries() - + retryContext.restartTimes.size()); + retryContext.getRestartTimes().add(currentTime); } else { - return retryContext.getRemainingRetries(); - } - } - - /** - * Updates remaining retries and the restart time when - * required in the retryContext. - */ - private void updateRetryContext(RetryContext retryContext, - int pendingRetries) { - retryContext.setRemainingRetries(pendingRetries - 1); - if (retryContext.containerRetryContext.getFailuresValidityInterval() - > 0) { - retryContext.getRestartTimes().add(clock.getTime()); + retryContext.remainingRetries--; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java index 04889a9ecf..bacf3bbf18 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java @@ -64,12 +64,18 @@ public void testFailuresValidityInterval() { new SlidingWindowRetryPolicy.RetryContext(retryContext); Assert.assertTrue("retry 1", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(20); Assert.assertTrue("retry 2", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(40); Assert.assertTrue("retry 3", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(45); Assert.assertFalse("retry failed", retryPolicy.shouldRetry(windowRetryContext, 12));