diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index c09c7f1ac2..5527ac4a12 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1602,8 +1602,10 @@ public ContainerState transition(final ContainerImpl container, } container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n"); } - if (container.shouldRetry(container.exitCode)) { + // Updates to the retry context should be protected from concurrent + // writes. It should only be called from this transition. + container.retryPolicy.updateRetryContext(container.windowRetryContext); container.storeRetryContext(); doRelaunch(container, container.windowRetryContext.getRemainingRetries(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java index 0208879475..36a8b918c5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java @@ -42,38 +42,58 @@ public SlidingWindowRetryPolicy(Clock clock) { public boolean shouldRetry(RetryContext retryContext, int errorCode) { - ContainerRetryContext containerRC = retryContext - .containerRetryContext; + ContainerRetryContext containerRC = retryContext.containerRetryContext; Preconditions.checkNotNull(containerRC, "container retry context null"); ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy(); if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES && containerRC.getErrorCodes() != null && containerRC.getErrorCodes().contains(errorCode))) { - if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) { - return true; - } - int pendingRetries = calculatePendingRetries(retryContext); - updateRetryContext(retryContext, pendingRetries); - return pendingRetries > 0; + return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER + || calculateRemainingRetries(retryContext) > 0; } return false; } /** - * Calculates the pending number of retries. - *
- * When failuresValidityInterval is > 0, it also removes time entries from
- * restartTimes
which are outside the validity interval.
+ * Calculates the remaining number of retries.
*
- * @return the pending retries.
+ * @return the remaining retries.
*/
- private int calculatePendingRetries(RetryContext retryContext) {
+ private int calculateRemainingRetries(RetryContext retryContext) {
ContainerRetryContext containerRC =
retryContext.containerRetryContext;
if (containerRC.getFailuresValidityInterval() > 0) {
+ int validFailuresCount = 0;
+ long currentTime = clock.getTime();
+ for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) {
+ long restartTime = retryContext.restartTimes.get(i);
+ if (currentTime - restartTime
+ <= containerRC.getFailuresValidityInterval()) {
+ validFailuresCount++;
+ } else {
+ break;
+ }
+ }
+ return containerRC.getMaxRetries() - validFailuresCount;
+ } else {
+ return retryContext.getRemainingRetries();
+ }
+ }
+
+ /**
+ * Updates remaining retries and the restart time when
+ * required in the retryContext.
+ *
+ * When failuresValidityInterval is > 0, it also removes time entries from
+ * restartTimes
which are outside the validity interval.
+ */
+ protected void updateRetryContext(RetryContext retryContext) {
+ if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
+ ContainerRetryContext containerRC = retryContext.containerRetryContext;
Iterator