YARN-2434. RM should not recover containers from previously failed attempt when AM restart is not enabled. Contributed by Jian He

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1619614 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jason Darrell Lowe 2014-08-21 22:41:34 +00:00
parent da4ba50269
commit 4236c6600e
3 changed files with 29 additions and 0 deletions

View File

@ -231,6 +231,9 @@ Release 2.6.0 - UNRELEASED
YARN-2424. LCE should support non-cgroups, non-secure mode (Chris Douglas
via aw)
YARN-2434. RM should not recover containers from previously failed attempt
when AM restart is not enabled (Jian He via jlowe)
Release 2.5.0 - 2014-08-11
INCOMPATIBLE CHANGES

View File

@ -273,6 +273,19 @@ public synchronized void recoverContainersOnNode(
SchedulerApplicationAttempt schedulerAttempt =
schedulerApp.getCurrentAppAttempt();
if (!rmApp.getApplicationSubmissionContext()
.getKeepContainersAcrossApplicationAttempts()) {
// Do not recover containers that belong to a stopped attempt or to an attempt other than the current one.
if (schedulerAttempt.isStopped()
|| !schedulerAttempt.getApplicationAttemptId().equals(
container.getContainerId().getApplicationAttemptId())) {
LOG.info("Skip recovering container " + container
+ " for already stopped attempt.");
killOrphanContainerOnNode(nm, container);
continue;
}
}
// create container
RMContainer rmContainer = recoverAndCreateContainer(container, nm);

View File

@ -513,6 +513,19 @@ public void testAMfailedBetweenRMRestart() throws Exception {
// just-recovered containers.
assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
MockNM nm2 =
new MockNM("127.1.1.1:4321", 8192, rm2.getResourceTrackerService());
NMContainerStatus previousAttemptContainer =
TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 4,
ContainerState.RUNNING);
nm2.registerNode(Arrays.asList(previousAttemptContainer), null);
// Wait for RM to settle down on recovering containers.
Thread.sleep(3000);
// Verify that containers from the previous failed attempt are not recovered.
assertNull(scheduler.getRMContainer(previousAttemptContainer.getContainerId()));
}
// Apps already completed before RM restart. Restarted RM scheduler should not