diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index e7f28d05cb..17fbbc3660 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -893,6 +893,11 @@ public static boolean isAclEnabled(Configuration conf) { NM_PREFIX + "container-diagnostics-maximum-size"; public static final int DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE = 10000; + /** Minimum container restart interval. */ + public static final String NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS = + NM_PREFIX + "container-retry-minimum-interval-ms"; + public static final int DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS = 1000; + /** Interval at which the delayed token removal thread runs */ public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS = RM_PREFIX + "delayed.delegation-token.removal-interval-ms"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index c9e761a247..703c03df0f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1588,6 +1588,12 @@ 10000 + + Minimum container restart interval in milliseconds. + yarn.nodemanager.container-retry-minimum-interval-ms + 1000 + + Max number of threads in NMClientAsync to process container management events diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index b1ddc2ef95..2ff8e7ccf3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -33,6 +33,7 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -155,6 +156,16 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, this.containerRetryContext = ContainerRetryContext.NEVER_RETRY_CONTEXT; } this.remainingRetryAttempts = containerRetryContext.getMaxRetries(); + int minimumRestartInterval = conf.getInt( + YarnConfiguration.NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS, + YarnConfiguration.DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS); + if (containerRetryContext.getRetryPolicy() + != ContainerRetryPolicy.NEVER_RETRY + && containerRetryContext.getRetryInterval() < minimumRestartInterval) { + LOG.info("Set restart interval to minimum value " + minimumRestartInterval + + "ms for container " + containerTokenIdentifier.getContainerID()); + this.containerRetryContext.setRetryInterval(minimumRestartInterval); + } this.diagnosticsMaxSize = conf.getInt( YarnConfiguration.NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE, YarnConfiguration.DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE); @@ -1368,4 +1379,9 @@ private static boolean shouldBeUploadedToSharedCache(ContainerImpl container, LocalResourceRequest resource) { return container.resourcesUploadPolicies.get(resource); } + + @VisibleForTesting + ContainerRetryContext getContainerRetryContext() { + return containerRetryContext; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index 118bc42bcb..766a1f91fe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -725,6 +725,40 @@ private void testContainerRetry(ContainerRetryContext containerRetryContext, } } + @Test + public void testContainerRestartInterval() throws IOException { + conf.setInt(YarnConfiguration.NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS, 2000); + + ContainerRetryContext containerRetryContext1 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.NEVER_RETRY, null, 3, 0); + testContainerRestartInterval(containerRetryContext1, 0); + + ContainerRetryContext containerRetryContext2 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 0); + testContainerRestartInterval(containerRetryContext2, 2000); + + ContainerRetryContext containerRetryContext3 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 4000); + testContainerRestartInterval(containerRetryContext3, 4000); + } + + private void testContainerRestartInterval( + ContainerRetryContext containerRetryContext, + int expectedRestartInterval) throws IOException { + WrappedContainer wc = null; + try { + wc = new WrappedContainer(25, 314159265358980L, 4345, + "yak", containerRetryContext); + Assert.assertEquals( + ((ContainerImpl)wc.c).getContainerRetryContext().getRetryInterval(), + expectedRestartInterval); + } finally { + if (wc != null) { + wc.finished(); + } + } + } + private void verifyCleanupCall(WrappedContainer wc) throws Exception { ResourcesReleasedMatcher matchesReq = new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(