YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
Change-Id: Ia83dd2499ee9000b9e09ae5a932f21a13c0ddee6
This commit is contained in:
parent
13d389bf51
commit
2a0fa50f9d
@ -179,11 +179,22 @@ private ContainerAllocation preCheckForNodeCandidateSet(
|
|||||||
// This is to make sure non-partitioned-resource-request will prefer
|
// This is to make sure non-partitioned-resource-request will prefer
|
||||||
// to be allocated to non-partitioned nodes
|
// to be allocated to non-partitioned nodes
|
||||||
int missedNonPartitionedRequestSchedulingOpportunity = 0;
|
int missedNonPartitionedRequestSchedulingOpportunity = 0;
|
||||||
|
AppPlacementAllocator appPlacementAllocator =
|
||||||
|
appInfo.getAppPlacementAllocator(schedulerKey);
|
||||||
|
if (null == appPlacementAllocator){
|
||||||
|
// This is possible when #pending resource decreased by a different
|
||||||
|
// thread.
|
||||||
|
ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
|
||||||
|
activitiesManager, node, application, priority,
|
||||||
|
ActivityDiagnosticConstant.PRIORITY_SKIPPED_BECAUSE_NULL_ANY_REQUEST);
|
||||||
|
return ContainerAllocation.PRIORITY_SKIPPED;
|
||||||
|
}
|
||||||
|
String requestPartition =
|
||||||
|
appPlacementAllocator.getPrimaryRequestedNodePartition();
|
||||||
|
|
||||||
// Only do this when request associated with given scheduler key accepts
|
// Only do this when request associated with given scheduler key accepts
|
||||||
// NO_LABEL under RESPECT_EXCLUSIVITY mode
|
// NO_LABEL under RESPECT_EXCLUSIVITY mode
|
||||||
if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL,
|
if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL, requestPartition)) {
|
||||||
appInfo.getAppPlacementAllocator(schedulerKey)
|
|
||||||
.getPrimaryRequestedNodePartition())) {
|
|
||||||
missedNonPartitionedRequestSchedulingOpportunity =
|
missedNonPartitionedRequestSchedulingOpportunity =
|
||||||
application.addMissedNonPartitionedRequestSchedulingOpportunity(
|
application.addMissedNonPartitionedRequestSchedulingOpportunity(
|
||||||
schedulerKey);
|
schedulerKey);
|
||||||
@ -261,12 +272,9 @@ ContainerAllocation tryAllocateOnNode(Resource clusterResource,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public float getLocalityWaitFactor(
|
public float getLocalityWaitFactor(int uniqAsks, int clusterNodes) {
|
||||||
SchedulerRequestKey schedulerKey, int clusterNodes) {
|
|
||||||
// Estimate: Required unique resources (i.e. hosts + racks)
|
// Estimate: Required unique resources (i.e. hosts + racks)
|
||||||
int requiredResources = Math.max(
|
int requiredResources = Math.max(uniqAsks - 1, 0);
|
||||||
application.getAppPlacementAllocator(schedulerKey)
|
|
||||||
.getUniqueLocationAsks() - 1, 0);
|
|
||||||
|
|
||||||
// waitFactor can't be more than '1'
|
// waitFactor can't be more than '1'
|
||||||
// i.e. no point skipping more than clustersize opportunities
|
// i.e. no point skipping more than clustersize opportunities
|
||||||
@ -296,10 +304,16 @@ private boolean canAssign(SchedulerRequestKey schedulerKey,
|
|||||||
if (rmContext.getScheduler().getNumClusterNodes() == 0) {
|
if (rmContext.getScheduler().getNumClusterNodes() == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int uniqLocationAsks = 0;
|
||||||
|
AppPlacementAllocator appPlacementAllocator =
|
||||||
|
application.getAppPlacementAllocator(schedulerKey);
|
||||||
|
if (appPlacementAllocator != null) {
|
||||||
|
uniqLocationAsks = appPlacementAllocator.getUniqueLocationAsks();
|
||||||
|
}
|
||||||
// If we have only ANY requests for this schedulerKey, we should not
|
// If we have only ANY requests for this schedulerKey, we should not
|
||||||
// delay its scheduling.
|
// delay its scheduling.
|
||||||
if (application.getAppPlacementAllocator(schedulerKey)
|
if (uniqLocationAsks == 1) {
|
||||||
.getUniqueLocationAsks() == 1) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -313,7 +327,7 @@ private boolean canAssign(SchedulerRequestKey schedulerKey,
|
|||||||
} else {
|
} else {
|
||||||
long requiredContainers =
|
long requiredContainers =
|
||||||
application.getOutstandingAsksCount(schedulerKey);
|
application.getOutstandingAsksCount(schedulerKey);
|
||||||
float localityWaitFactor = getLocalityWaitFactor(schedulerKey,
|
float localityWaitFactor = getLocalityWaitFactor(uniqLocationAsks,
|
||||||
rmContext.getScheduler().getNumClusterNodes());
|
rmContext.getScheduler().getNumClusterNodes());
|
||||||
// Cap the delay by the number of nodes in the cluster.
|
// Cap the delay by the number of nodes in the cluster.
|
||||||
return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
|
return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
|
||||||
@ -806,6 +820,12 @@ private ContainerAllocation allocate(Resource clusterResource,
|
|||||||
application.getAppSchedulingInfo().getAppPlacementAllocator(
|
application.getAppSchedulingInfo().getAppPlacementAllocator(
|
||||||
schedulerKey);
|
schedulerKey);
|
||||||
|
|
||||||
|
// This could be null when #pending request decreased by another thread.
|
||||||
|
if (schedulingPS == null) {
|
||||||
|
return new ContainerAllocation(reservedContainer, null,
|
||||||
|
AllocationState.QUEUE_SKIPPED);
|
||||||
|
}
|
||||||
|
|
||||||
result = ContainerAllocation.PRIORITY_SKIPPED;
|
result = ContainerAllocation.PRIORITY_SKIPPED;
|
||||||
|
|
||||||
Iterator<FiCaSchedulerNode> iter = schedulingPS.getPreferredNodeIterator(
|
Iterator<FiCaSchedulerNode> iter = schedulingPS.getPreferredNodeIterator(
|
||||||
|
Loading…
Reference in New Issue
Block a user