YARN-10154. Addendum Patch which fixes below bugs
1. RM fails to start when LeafQueueTemplate max capacity is not specified. 2. Job stuck in ACCEPTED state with DominantResourceCalculator as Queue Capacity is set to NaN during RM startup with clusterResource is zero. Reviewed by Sunil G and Manikandan R.
This commit is contained in:
parent
8ffc356b1e
commit
450e5aa9dd
@ -192,40 +192,55 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
|
||||
*
|
||||
*/
|
||||
if (this.capacityConfigType.equals(CapacityConfigType.ABSOLUTE_RESOURCE)) {
|
||||
for (String label : queueCapacities.getExistingNodeLabels()) {
|
||||
queueCapacities.setCapacity(label,
|
||||
this.csContext.getResourceCalculator().divide(
|
||||
this.csContext.getClusterResource(),
|
||||
this.csContext.getConfiguration().getMinimumResourceRequirement(
|
||||
label,
|
||||
this.csContext.getConfiguration()
|
||||
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
|
||||
resourceTypes),
|
||||
getQueueResourceQuotas().getConfiguredMinResource(label)));
|
||||
|
||||
queueCapacities.setMaximumCapacity(label,
|
||||
this.csContext.getResourceCalculator().divide(
|
||||
this.csContext.getClusterResource(),
|
||||
this.csContext.getConfiguration().getMaximumResourceRequirement(
|
||||
label,
|
||||
this.csContext.getConfiguration()
|
||||
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
|
||||
resourceTypes),
|
||||
getQueueResourceQuotas().getConfiguredMaxResource(label)));
|
||||
|
||||
queueCapacities.setAbsoluteCapacity(label,
|
||||
queueCapacities.getCapacity(label)
|
||||
* getQueueCapacities().getAbsoluteCapacity(label));
|
||||
|
||||
queueCapacities.setAbsoluteMaximumCapacity(label,
|
||||
queueCapacities.getMaximumCapacity(label)
|
||||
* getQueueCapacities().getAbsoluteMaximumCapacity(label));
|
||||
}
|
||||
updateQueueCapacities(queueCapacities);
|
||||
}
|
||||
builder.capacities(queueCapacities);
|
||||
return builder;
|
||||
}
|
||||
|
||||
private void updateQueueCapacities(QueueCapacities queueCapacities) {
|
||||
for (String label : queueCapacities.getExistingNodeLabels()) {
|
||||
queueCapacities.setCapacity(label,
|
||||
this.csContext.getResourceCalculator().divide(
|
||||
this.csContext.getClusterResource(),
|
||||
this.csContext.getConfiguration().getMinimumResourceRequirement(
|
||||
label,
|
||||
this.csContext.getConfiguration()
|
||||
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
|
||||
resourceTypes),
|
||||
getQueueResourceQuotas().getConfiguredMinResource(label)));
|
||||
|
||||
Resource childMaxResource = this.csContext.getConfiguration()
|
||||
.getMaximumResourceRequirement(label,
|
||||
this.csContext.getConfiguration()
|
||||
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
|
||||
resourceTypes);
|
||||
Resource parentMaxRes = getQueueResourceQuotas()
|
||||
.getConfiguredMaxResource(label);
|
||||
|
||||
Resource effMaxResource = Resources.min(
|
||||
this.csContext.getResourceCalculator(),
|
||||
this.csContext.getClusterResource(),
|
||||
childMaxResource.equals(Resources.none()) ? parentMaxRes
|
||||
: childMaxResource,
|
||||
parentMaxRes);
|
||||
|
||||
queueCapacities.setMaximumCapacity(
|
||||
label, this.csContext.getResourceCalculator().divide(
|
||||
this.csContext.getClusterResource(),
|
||||
effMaxResource,
|
||||
getQueueResourceQuotas().getConfiguredMaxResource(label)));
|
||||
|
||||
queueCapacities.setAbsoluteCapacity(
|
||||
label, queueCapacities.getCapacity(label)
|
||||
* getQueueCapacities().getAbsoluteCapacity(label));
|
||||
|
||||
queueCapacities.setAbsoluteMaximumCapacity(label,
|
||||
queueCapacities.getMaximumCapacity(label)
|
||||
* getQueueCapacities().getAbsoluteMaximumCapacity(label));
|
||||
}
|
||||
}
|
||||
|
||||
protected void validate(final CSQueue newlyParsedQueue) throws IOException {
|
||||
// Sanity check
|
||||
if (!(newlyParsedQueue instanceof ManagedParentQueue) || !newlyParsedQueue
|
||||
@ -276,6 +291,16 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
|
||||
|
||||
AutoCreatedLeafQueue leafQueue = (AutoCreatedLeafQueue) childQueue;
|
||||
super.addChildQueue(leafQueue);
|
||||
|
||||
/* Below is to avoid Setting Queue Capacity to NaN when ClusterResource
|
||||
is zero during RM Startup with DominantResourceCalculator */
|
||||
if (this.capacityConfigType.equals(
|
||||
CapacityConfigType.ABSOLUTE_RESOURCE)) {
|
||||
QueueCapacities queueCapacities =
|
||||
getLeafQueueTemplate().getQueueCapacities();
|
||||
updateQueueCapacities(queueCapacities);
|
||||
}
|
||||
|
||||
final AutoCreatedLeafQueueConfig initialLeafQueueTemplate =
|
||||
queueManagementPolicy.getInitialLeafQueueConfiguration(leafQueue);
|
||||
|
||||
|
@ -22,6 +22,7 @@ import static org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager.NO_LABEL
|
||||
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueueUtils.EPSILON;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.util.HashMap;
|
||||
@ -33,11 +34,17 @@ import java.util.Set;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.queuemanagement.GuaranteedOrZeroCapacityOverTimePolicy;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.FifoOrderingPolicy;
|
||||
import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.slf4j.Logger;
|
||||
@ -274,4 +281,38 @@ public class TestAbsoluteResourceWithAutoQueue
|
||||
fail("Exception should be thrown as leaf queue template configuration is "
|
||||
+ "not same as Parent configuration");
|
||||
}
|
||||
|
||||
@Test(timeout = 20000)
|
||||
public void testApplicationRunningWithDRF() throws Exception {
|
||||
CapacitySchedulerConfiguration csConf =
|
||||
setupSimpleQueueConfiguration(false);
|
||||
setupMinMaxResourceConfiguration(csConf);
|
||||
csConf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
|
||||
ResourceScheduler.class);
|
||||
|
||||
// Validate Leaf Queue Template in Absolute Resource with DRF
|
||||
csConf.setResourceComparator(DominantResourceCalculator.class);
|
||||
setupGroupQueueMappings(QUEUED, csConf, "%user");
|
||||
|
||||
mockRM = new MockRM(csConf);
|
||||
mockRM.start();
|
||||
|
||||
MockNM nm1 = mockRM.registerNode("127.0.0.1:1234", 250 * GB, 40);
|
||||
|
||||
// Submit a Application and validate if it is moving to RUNNING state
|
||||
RMApp app1 = MockRMAppSubmitter.submit(mockRM,
|
||||
MockRMAppSubmissionData.Builder.createWithMemory(1024, mockRM)
|
||||
.withAppName("app1")
|
||||
.withUser(TEST_GROUPUSER)
|
||||
.withAcls(null)
|
||||
.build());
|
||||
MockAM am1 = MockRM.launchAndRegisterAM(app1, mockRM, nm1);
|
||||
|
||||
cs = (CapacityScheduler) mockRM.getResourceScheduler();
|
||||
AutoCreatedLeafQueue autoCreatedLeafQueue =
|
||||
(AutoCreatedLeafQueue) cs.getQueue(TEST_GROUPUSER);
|
||||
assertNotNull("Auto Creation of Queue failed", autoCreatedLeafQueue);
|
||||
ManagedParentQueue parentQueue = (ManagedParentQueue) cs.getQueue(QUEUED);
|
||||
assertEquals(parentQueue, autoCreatedLeafQueue.getParent());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user