YARN-7382. NoSuchElementException in FairScheduler after failover causes RM crash (rkanter)

This commit is contained in:
Robert Kanter 2017-10-24 10:21:44 -07:00
parent 679f99b145
commit 025c656572
2 changed files with 28 additions and 3 deletions

View File

@ -658,6 +658,16 @@ public synchronized void recoverContainer(SchedulerNode node,
if (!rmContainer.getState().equals(RMContainerState.COMPLETED)) { if (!rmContainer.getState().equals(RMContainerState.COMPLETED)) {
getQueue().incUsedResource(rmContainer.getContainer().getResource()); getQueue().incUsedResource(rmContainer.getContainer().getResource());
} }
// If not running unmanaged, the first container we recover is always
// the AM. Set the amResource for this app and update the leaf queue's AM
// usage
if (!isAmRunning() && !getUnmanagedAM()) {
Resource resource = rmContainer.getAllocatedResource();
setAMResource(resource);
getQueue().addAMResourceUsage(resource);
setAmRunning(true);
}
} finally { } finally {
writeLock.unlock(); writeLock.unlock();
} }

View File

@ -66,6 +66,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSQueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerTestBase; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerTestBase;
@ -158,6 +159,7 @@ public void testSchedulerRecovery() throws Exception {
new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService()); new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode(); nm1.registerNode();
RMApp app1 = rm1.submitApp(200); RMApp app1 = rm1.submitApp(200);
Resource amResources = app1.getAMResourceRequests().get(0).getCapability();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics // clear queue metrics
@ -240,7 +242,8 @@ public void testSchedulerRecovery() throws Exception {
if (getSchedulerType() == SchedulerType.CAPACITY) { if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2); checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else { } else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources); checkFSQueue(rm2, schedulerApp, usedResources, availableResources,
amResources);
} }
// *********** check scheduler attempt state.******** // *********** check scheduler attempt state.********
@ -310,6 +313,7 @@ public void testDynamicQueueRecovery() throws Exception {
RMApp app1 = rm1.submitApp(200, "dynamicQApp", RMApp app1 = rm1.submitApp(200, "dynamicQApp",
UserGroupInformation.getCurrentUser().getShortUserName(), null, UserGroupInformation.getCurrentUser().getShortUserName(), null,
ReservationSystemTestUtil.getReservationQueueName()); ReservationSystemTestUtil.getReservationQueueName());
Resource amResources = app1.getAMResourceRequests().get(0).getCapability();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics // clear queue metrics
@ -384,7 +388,8 @@ public void testDynamicQueueRecovery() throws Exception {
if (getSchedulerType() == SchedulerType.CAPACITY) { if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2); checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else { } else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources); checkFSQueue(rm2, schedulerApp, usedResources, availableResources,
amResources);
} }
// *********** check scheduler attempt state.******** // *********** check scheduler attempt state.********
@ -456,7 +461,7 @@ private void checkCSLeafQueue(MockRM rm,
private void checkFSQueue(ResourceManager rm, private void checkFSQueue(ResourceManager rm,
SchedulerApplication schedulerApp, Resource usedResources, SchedulerApplication schedulerApp, Resource usedResources,
Resource availableResources) throws Exception { Resource availableResources, Resource amResources) throws Exception {
// waiting for RM's scheduling apps // waiting for RM's scheduling apps
int retry = 0; int retry = 0;
Resource assumedFairShare = Resource.newInstance(8192, 8); Resource assumedFairShare = Resource.newInstance(8192, 8);
@ -488,6 +493,16 @@ private void checkFSQueue(ResourceManager rm,
assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(), assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(),
availableResources.getVirtualCores(), usedResources.getMemorySize(), availableResources.getVirtualCores(), usedResources.getMemorySize(),
usedResources.getVirtualCores()); usedResources.getVirtualCores());
// ************ check AM resources ****************
assertEquals(amResources,
schedulerApp.getCurrentAppAttempt().getAMResource());
FSQueueMetrics fsQueueMetrics =
(FSQueueMetrics) schedulerApp.getQueue().getMetrics();
assertEquals(amResources.getMemorySize(),
fsQueueMetrics.getAMResourceUsageMB());
assertEquals(amResources.getVirtualCores(),
fsQueueMetrics.getAMResourceUsageVCores());
} }
// create 3 container reports for AM // create 3 container reports for AM