YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
This commit is contained in:
parent
05e04f34f2
commit
4513761869
@ -541,6 +541,9 @@ Release 2.7.1 - UNRELEASED
|
|||||||
YARN-3646. Applications are getting stuck some times in case of retry
|
YARN-3646. Applications are getting stuck some times in case of retry
|
||||||
policy forever. (Raju Bairishetti via devaraj)
|
policy forever. (Raju Bairishetti via devaraj)
|
||||||
|
|
||||||
|
YARN-3675. FairScheduler: RM quits when node removal races with
|
||||||
|
continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
|
||||||
|
|
||||||
Release 2.7.0 - 2015-04-20
|
Release 2.7.0 - 2015-04-20
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
@ -1040,12 +1040,22 @@ public int compare(NodeId n1, NodeId n2) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private synchronized void attemptScheduling(FSSchedulerNode node) {
|
@VisibleForTesting
|
||||||
|
synchronized void attemptScheduling(FSSchedulerNode node) {
|
||||||
if (rmContext.isWorkPreservingRecoveryEnabled()
|
if (rmContext.isWorkPreservingRecoveryEnabled()
|
||||||
&& !rmContext.isSchedulerReadyForAllocatingContainers()) {
|
&& !rmContext.isSchedulerReadyForAllocatingContainers()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final NodeId nodeID = node.getNodeID();
|
||||||
|
if (!nodes.containsKey(nodeID)) {
|
||||||
|
// The node might have just been removed while this thread was waiting
|
||||||
|
// on the synchronized lock before it entered this synchronized method
|
||||||
|
LOG.info("Skipping scheduling as the node " + nodeID +
|
||||||
|
" has been removed");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Assign new containers...
|
// Assign new containers...
|
||||||
// 1. Check for reserved applications
|
// 1. Check for reserved applications
|
||||||
// 2. Schedule if there are no reservations
|
// 2. Schedule if there are no reservations
|
||||||
|
@ -3889,6 +3889,50 @@ public void testContinuousSchedulingWithNodeRemoved() throws Exception {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSchedulingOnRemovedNode() throws Exception {
|
||||||
|
// Disable continuous scheduling, will invoke continuous scheduling manually
|
||||||
|
scheduler.init(conf);
|
||||||
|
scheduler.start();
|
||||||
|
Assert.assertTrue("Continuous scheduling should be disabled.",
|
||||||
|
!scheduler.isContinuousSchedulingEnabled());
|
||||||
|
|
||||||
|
ApplicationAttemptId id11 = createAppAttemptId(1, 1);
|
||||||
|
createMockRMApp(id11);
|
||||||
|
|
||||||
|
scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
|
||||||
|
false);
|
||||||
|
scheduler.addApplicationAttempt(id11, false, false);
|
||||||
|
|
||||||
|
List<ResourceRequest> ask1 = new ArrayList<>();
|
||||||
|
ResourceRequest request1 =
|
||||||
|
createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
|
||||||
|
|
||||||
|
ask1.add(request1);
|
||||||
|
scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
|
||||||
|
null);
|
||||||
|
|
||||||
|
String hostName = "127.0.0.1";
|
||||||
|
RMNode node1 = MockNodes.newNodeInfo(1,
|
||||||
|
Resources.createResource(8 * 1024, 8), 1, hostName);
|
||||||
|
NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
|
||||||
|
scheduler.handle(nodeEvent1);
|
||||||
|
|
||||||
|
FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode(
|
||||||
|
node1.getNodeID());
|
||||||
|
|
||||||
|
NodeRemovedSchedulerEvent removeNode1 =
|
||||||
|
new NodeRemovedSchedulerEvent(node1);
|
||||||
|
scheduler.handle(removeNode1);
|
||||||
|
|
||||||
|
scheduler.attemptScheduling(node);
|
||||||
|
|
||||||
|
AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
|
||||||
|
new AppAttemptRemovedSchedulerEvent(id11,
|
||||||
|
RMAppAttemptState.FINISHED, false);
|
||||||
|
scheduler.handle(appRemovedEvent1);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
|
public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
Loading…
Reference in New Issue
Block a user