YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)

This commit is contained in:
Karthik Kambatla 2015-05-21 13:38:30 -07:00
parent 05e04f34f2
commit 4513761869
3 changed files with 59 additions and 2 deletions

View File

@ -541,6 +541,9 @@ Release 2.7.1 - UNRELEASED
YARN-3646. Applications are getting stuck some times in case of retry YARN-3646. Applications are getting stuck some times in case of retry
policy forever. (Raju Bairishetti via devaraj) policy forever. (Raju Bairishetti via devaraj)
YARN-3675. FairScheduler: RM quits when node removal races with
continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
Release 2.7.0 - 2015-04-20 Release 2.7.0 - 2015-04-20
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1040,12 +1040,22 @@ public int compare(NodeId n1, NodeId n2) {
} }
} }
private synchronized void attemptScheduling(FSSchedulerNode node) { @VisibleForTesting
synchronized void attemptScheduling(FSSchedulerNode node) {
if (rmContext.isWorkPreservingRecoveryEnabled() if (rmContext.isWorkPreservingRecoveryEnabled()
&& !rmContext.isSchedulerReadyForAllocatingContainers()) { && !rmContext.isSchedulerReadyForAllocatingContainers()) {
return; return;
} }
final NodeId nodeID = node.getNodeID();
if (!nodes.containsKey(nodeID)) {
// The node might have just been removed while this thread was waiting
// on the synchronized lock before it entered this synchronized method
LOG.info("Skipping scheduling as the node " + nodeID +
" has been removed");
return;
}
// Assign new containers... // Assign new containers...
// 1. Check for reserved applications // 1. Check for reserved applications
// 2. Schedule if there are no reservations // 2. Schedule if there are no reservations

View File

@ -3889,6 +3889,50 @@ public void testContinuousSchedulingWithNodeRemoved() throws Exception {
} }
} }
@Test
public void testSchedulingOnRemovedNode() throws Exception {
// Disable continuous scheduling, will invoke continuous scheduling manually
scheduler.init(conf);
scheduler.start();
Assert.assertTrue("Continuous scheduling should be disabled.",
!scheduler.isContinuousSchedulingEnabled());
ApplicationAttemptId id11 = createAppAttemptId(1, 1);
createMockRMApp(id11);
scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
false);
scheduler.addApplicationAttempt(id11, false, false);
List<ResourceRequest> ask1 = new ArrayList<>();
ResourceRequest request1 =
createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
ask1.add(request1);
scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
null);
String hostName = "127.0.0.1";
RMNode node1 = MockNodes.newNodeInfo(1,
Resources.createResource(8 * 1024, 8), 1, hostName);
NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
scheduler.handle(nodeEvent1);
FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode(
node1.getNodeID());
NodeRemovedSchedulerEvent removeNode1 =
new NodeRemovedSchedulerEvent(node1);
scheduler.handle(removeNode1);
scheduler.attemptScheduling(node);
AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
new AppAttemptRemovedSchedulerEvent(id11,
RMAppAttemptState.FINISHED, false);
scheduler.handle(appRemovedEvent1);
}
@Test @Test
public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured() public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
throws IOException { throws IOException {