YARN-6775. CapacityScheduler: Improvements to assignContainers, avoid unnecessary canAssignToUser/Queue calls. (Nathan Roberts via wangda)
Change-Id: I84ccd54200ccbaae23018ef320028e42b4c3509a
This commit is contained in:
parent
b61ab8573e
commit
945c0958bb
@ -63,9 +63,14 @@ public static void recordRejectedAppActivityFromLeafQueue(
|
||||
SchedulerApplicationAttempt application, Priority priority,
|
||||
String diagnostic) {
|
||||
String type = "app";
|
||||
recordActivity(activitiesManager, node, application.getQueueName(),
|
||||
application.getApplicationId().toString(), priority,
|
||||
ActivityState.REJECTED, diagnostic, type);
|
||||
if (activitiesManager == null) {
|
||||
return;
|
||||
}
|
||||
if (activitiesManager.shouldRecordThisNode(node.getNodeID())) {
|
||||
recordActivity(activitiesManager, node, application.getQueueName(),
|
||||
application.getApplicationId().toString(), priority,
|
||||
ActivityState.REJECTED, diagnostic, type);
|
||||
}
|
||||
finishSkippedAppAllocationRecording(activitiesManager,
|
||||
application.getApplicationId(), ActivityState.REJECTED, diagnostic);
|
||||
}
|
||||
@ -203,8 +208,13 @@ public static class QUEUE {
|
||||
public static void recordQueueActivity(ActivitiesManager activitiesManager,
|
||||
SchedulerNode node, String parentQueueName, String queueName,
|
||||
ActivityState state, String diagnostic) {
|
||||
recordActivity(activitiesManager, node, parentQueueName, queueName, null,
|
||||
state, diagnostic, null);
|
||||
if (activitiesManager == null) {
|
||||
return;
|
||||
}
|
||||
if (activitiesManager.shouldRecordThisNode(node.getNodeID())) {
|
||||
recordActivity(activitiesManager, node, parentQueueName, queueName,
|
||||
null, state, diagnostic, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -266,13 +276,10 @@ public static void startNodeUpdateRecording(
|
||||
private static void recordActivity(ActivitiesManager activitiesManager,
|
||||
SchedulerNode node, String parentName, String childName,
|
||||
Priority priority, ActivityState state, String diagnostic, String type) {
|
||||
if (activitiesManager == null) {
|
||||
return;
|
||||
}
|
||||
if (activitiesManager.shouldRecordThisNode(node.getNodeID())) {
|
||||
activitiesManager.addSchedulingActivityForNode(node.getNodeID(),
|
||||
parentName, childName, priority != null ? priority.toString() : null,
|
||||
state, diagnostic, type);
|
||||
}
|
||||
|
||||
activitiesManager.addSchedulingActivityForNode(node.getNodeID(), parentName,
|
||||
childName, priority != null ? priority.toString() : null, state,
|
||||
diagnostic, type);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1026,6 +1026,8 @@ public CSAssignment assignContainers(Resource clusterResource,
|
||||
return CSAssignment.NULL_ASSIGNMENT;
|
||||
}
|
||||
|
||||
Map<String, CachedUserLimit> userLimits = new HashMap<>();
|
||||
boolean needAssignToQueueCheck = true;
|
||||
for (Iterator<FiCaSchedulerApp> assignmentIterator =
|
||||
orderingPolicy.getAssignmentIterator();
|
||||
assignmentIterator.hasNext(); ) {
|
||||
@ -1035,24 +1037,50 @@ public CSAssignment assignContainers(Resource clusterResource,
|
||||
node.getNodeID(), SystemClock.getInstance().getTime(), application);
|
||||
|
||||
// Check queue max-capacity limit
|
||||
if (!super.canAssignToThisQueue(clusterResource, ps.getPartition(),
|
||||
currentResourceLimits, application.getCurrentReservation(),
|
||||
schedulingMode)) {
|
||||
ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
|
||||
activitiesManager, node, application, application.getPriority(),
|
||||
ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
|
||||
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
||||
getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
|
||||
ActivityDiagnosticConstant.EMPTY);
|
||||
return CSAssignment.NULL_ASSIGNMENT;
|
||||
Resource appReserved = application.getCurrentReservation();
|
||||
if (needAssignToQueueCheck) {
|
||||
if (!super.canAssignToThisQueue(clusterResource, node.getPartition(),
|
||||
currentResourceLimits, appReserved, schedulingMode)) {
|
||||
ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
|
||||
activitiesManager, node, application, application.getPriority(),
|
||||
ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
|
||||
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
||||
getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
|
||||
ActivityDiagnosticConstant.EMPTY);
|
||||
return CSAssignment.NULL_ASSIGNMENT;
|
||||
}
|
||||
// If there was no reservation and canAssignToThisQueue returned
|
||||
// true, there is no reason to check further.
|
||||
if (!this.reservationsContinueLooking
|
||||
|| appReserved.equals(Resources.none())) {
|
||||
needAssignToQueueCheck = false;
|
||||
}
|
||||
}
|
||||
|
||||
CachedUserLimit cul = userLimits.get(application.getUser());
|
||||
Resource cachedUserLimit = null;
|
||||
if (cul != null) {
|
||||
cachedUserLimit = cul.userLimit;
|
||||
}
|
||||
Resource userLimit = computeUserLimitAndSetHeadroom(application,
|
||||
clusterResource, ps.getPartition(), schedulingMode);
|
||||
|
||||
clusterResource, ps.getPartition(), schedulingMode, cachedUserLimit);
|
||||
if (cul == null) {
|
||||
cul = new CachedUserLimit(userLimit);
|
||||
userLimits.put(application.getUser(), cul);
|
||||
}
|
||||
// Check user limit
|
||||
if (!canAssignToUser(clusterResource, application.getUser(), userLimit,
|
||||
application, ps.getPartition(), currentResourceLimits)) {
|
||||
boolean userAssignable = true;
|
||||
if (!cul.canAssign && Resources.fitsIn(appReserved, cul.reservation)) {
|
||||
userAssignable = false;
|
||||
} else {
|
||||
userAssignable = canAssignToUser(clusterResource, application.getUser(),
|
||||
userLimit, application, node.getPartition(), currentResourceLimits);
|
||||
if (!userAssignable && Resources.fitsIn(cul.reservation, appReserved)) {
|
||||
cul.canAssign = false;
|
||||
cul.reservation = appReserved;
|
||||
}
|
||||
}
|
||||
if (!userAssignable) {
|
||||
application.updateAMContainerDiagnostics(AMState.ACTIVATED,
|
||||
"User capacity has reached its maximum limit.");
|
||||
ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
|
||||
@ -1127,7 +1155,7 @@ public boolean accept(Resource cluster,
|
||||
|
||||
// check user-limit
|
||||
Resource userLimit = computeUserLimitAndSetHeadroom(app, cluster, p,
|
||||
allocation.getSchedulingMode());
|
||||
allocation.getSchedulingMode(), null);
|
||||
|
||||
// Deduct resources that we can release
|
||||
Resource usedResource = Resources.clone(getUser(username).getUsed(p));
|
||||
@ -1332,19 +1360,20 @@ private void setQueueResourceLimitsInfo(
|
||||
@Lock({LeafQueue.class})
|
||||
Resource computeUserLimitAndSetHeadroom(FiCaSchedulerApp application,
|
||||
Resource clusterResource, String nodePartition,
|
||||
SchedulingMode schedulingMode) {
|
||||
SchedulingMode schedulingMode, Resource userLimit) {
|
||||
String user = application.getUser();
|
||||
User queueUser = getUser(user);
|
||||
|
||||
// Compute user limit respect requested labels,
|
||||
// TODO, need consider headroom respect labels also
|
||||
Resource userLimit =
|
||||
getResourceLimitForActiveUsers(application.getUser(), clusterResource,
|
||||
nodePartition, schedulingMode);
|
||||
|
||||
if (userLimit == null) {
|
||||
userLimit = getResourceLimitForActiveUsers(application.getUser(),
|
||||
clusterResource, nodePartition, schedulingMode);
|
||||
}
|
||||
setQueueResourceLimitsInfo(clusterResource);
|
||||
|
||||
Resource headroom =
|
||||
metrics.getUserMetrics(user) == null ? Resources.none() :
|
||||
getHeadroom(queueUser, cachedResourceLimitsForHeadroom.getLimit(),
|
||||
clusterResource, userLimit, nodePartition);
|
||||
|
||||
@ -1352,7 +1381,7 @@ Resource computeUserLimitAndSetHeadroom(FiCaSchedulerApp application,
|
||||
LOG.debug("Headroom calculation for user " + user + ": " + " userLimit="
|
||||
+ userLimit + " queueMaxAvailRes="
|
||||
+ cachedResourceLimitsForHeadroom.getLimit() + " consumed="
|
||||
+ queueUser.getUsed() + " headroom=" + headroom + " partition="
|
||||
+ queueUser.getUsed() + " partition="
|
||||
+ nodePartition);
|
||||
}
|
||||
|
||||
@ -1713,7 +1742,7 @@ public void updateClusterResource(Resource clusterResource,
|
||||
.getSchedulableEntities()) {
|
||||
computeUserLimitAndSetHeadroom(application, clusterResource,
|
||||
RMNodeLabelsManager.NO_LABEL,
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
}
|
||||
} finally {
|
||||
writeLock.unlock();
|
||||
@ -2052,4 +2081,14 @@ public void stopQueue() {
|
||||
public Set<String> getAllUsers() {
|
||||
return this.getUsersManager().getUsers().keySet();
|
||||
}
|
||||
|
||||
static class CachedUserLimit {
|
||||
final Resource userLimit;
|
||||
boolean canAssign = true;
|
||||
Resource reservation = Resources.none();
|
||||
|
||||
CachedUserLimit(Resource userLimit) {
|
||||
this.userLimit = userLimit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -33,9 +33,11 @@
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.BrokenBarrierException;
|
||||
import java.util.concurrent.CyclicBarrier;
|
||||
@ -50,6 +52,7 @@
|
||||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.security.token.TokenIdentifier;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.util.Time;
|
||||
import org.apache.hadoop.yarn.LocalConfigurationProvider;
|
||||
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||
@ -90,7 +93,6 @@
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.UpdateNodeResourceRequest;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.AdminService;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.Application;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
|
||||
@ -156,8 +158,12 @@
|
||||
import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
|
||||
import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
|
||||
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Assume;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
@ -3492,6 +3498,7 @@ Collections.<ContainerId> emptyList(), null, null,
|
||||
rm.stop();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHeadRoomCalculationWithDRC() throws Exception {
|
||||
// test with total cluster resource of 20GB memory and 20 vcores.
|
||||
@ -4074,6 +4081,143 @@ public void testCSReservationWithRootUnblocked() throws Exception {
|
||||
rm.stop();
|
||||
}
|
||||
|
||||
@Test (timeout = 300000)
|
||||
public void testUserLimitThroughput() throws Exception {
|
||||
// Since this is more of a performance unit test, only run if
|
||||
// RunUserLimitThroughput is set (-DRunUserLimitThroughput=true)
|
||||
Assume.assumeTrue(Boolean.valueOf(
|
||||
System.getProperty("RunUserLimitThroughput")));
|
||||
|
||||
CapacitySchedulerConfiguration csconf =
|
||||
new CapacitySchedulerConfiguration();
|
||||
csconf.setMaximumApplicationMasterResourcePerQueuePercent("root", 100.0f);
|
||||
csconf.setMaximumAMResourcePercentPerPartition("root", "", 100.0f);
|
||||
csconf.setMaximumApplicationMasterResourcePerQueuePercent("root.default",
|
||||
100.0f);
|
||||
csconf.setMaximumAMResourcePercentPerPartition("root.default", "", 100.0f);
|
||||
csconf.setResourceComparator(DominantResourceCalculator.class);
|
||||
|
||||
YarnConfiguration conf = new YarnConfiguration(csconf);
|
||||
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
|
||||
ResourceScheduler.class);
|
||||
|
||||
MockRM rm = new MockRM(conf);
|
||||
rm.start();
|
||||
|
||||
CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
|
||||
LeafQueue qb = (LeafQueue)cs.getQueue("default");
|
||||
|
||||
// For now make user limit large so we can activate all applications
|
||||
qb.setUserLimitFactor((float)100.0);
|
||||
qb.setupConfigurableCapacities();
|
||||
|
||||
SchedulerEvent addAppEvent;
|
||||
SchedulerEvent addAttemptEvent;
|
||||
Container container = mock(Container.class);
|
||||
ApplicationSubmissionContext submissionContext =
|
||||
mock(ApplicationSubmissionContext.class);
|
||||
|
||||
final int appCount = 100;
|
||||
ApplicationId[] appids = new ApplicationId[appCount];
|
||||
RMAppAttemptImpl[] attempts = new RMAppAttemptImpl[appCount];
|
||||
ApplicationAttemptId[] appAttemptIds = new ApplicationAttemptId[appCount];
|
||||
RMAppImpl[] apps = new RMAppImpl[appCount];
|
||||
RMAppAttemptMetrics[] attemptMetrics = new RMAppAttemptMetrics[appCount];
|
||||
for (int i=0; i<appCount; i++) {
|
||||
appids[i] = BuilderUtils.newApplicationId(100, i);
|
||||
appAttemptIds[i] =
|
||||
BuilderUtils.newApplicationAttemptId(appids[i], 1);
|
||||
|
||||
attemptMetrics[i] =
|
||||
new RMAppAttemptMetrics(appAttemptIds[i], rm.getRMContext());
|
||||
apps[i] = mock(RMAppImpl.class);
|
||||
when(apps[i].getApplicationId()).thenReturn(appids[i]);
|
||||
attempts[i] = mock(RMAppAttemptImpl.class);
|
||||
when(attempts[i].getMasterContainer()).thenReturn(container);
|
||||
when(attempts[i].getSubmissionContext()).thenReturn(submissionContext);
|
||||
when(attempts[i].getAppAttemptId()).thenReturn(appAttemptIds[i]);
|
||||
when(attempts[i].getRMAppAttemptMetrics()).thenReturn(attemptMetrics[i]);
|
||||
when(apps[i].getCurrentAppAttempt()).thenReturn(attempts[i]);
|
||||
|
||||
rm.getRMContext().getRMApps().put(appids[i], apps[i]);
|
||||
addAppEvent =
|
||||
new AppAddedSchedulerEvent(appids[i], "default", "user1");
|
||||
cs.handle(addAppEvent);
|
||||
addAttemptEvent =
|
||||
new AppAttemptAddedSchedulerEvent(appAttemptIds[i], false);
|
||||
cs.handle(addAttemptEvent);
|
||||
}
|
||||
|
||||
// add nodes to cluster, so cluster has 20GB and 20 vcores
|
||||
Resource newResource = Resource.newInstance(10 * GB, 10);
|
||||
RMNode node = MockNodes.newNodeInfo(0, newResource, 1, "127.0.0.1");
|
||||
cs.handle(new NodeAddedSchedulerEvent(node));
|
||||
|
||||
Resource newResource2 = Resource.newInstance(10 * GB, 10);
|
||||
RMNode node2 = MockNodes.newNodeInfo(0, newResource2, 1, "127.0.0.2");
|
||||
cs.handle(new NodeAddedSchedulerEvent(node2));
|
||||
|
||||
Priority u0Priority = TestUtils.createMockPriority(1);
|
||||
RecordFactory recordFactory =
|
||||
RecordFactoryProvider.getRecordFactory(null);
|
||||
|
||||
FiCaSchedulerApp[] fiCaApps = new FiCaSchedulerApp[appCount];
|
||||
for (int i=0;i<appCount;i++) {
|
||||
fiCaApps[i] =
|
||||
cs.getSchedulerApplications().get(apps[i].getApplicationId())
|
||||
.getCurrentAppAttempt();
|
||||
// allocate container for app2 with 1GB memory and 1 vcore
|
||||
fiCaApps[i].updateResourceRequests(Collections.singletonList(
|
||||
TestUtils.createResourceRequest(ResourceRequest.ANY, 1*GB, 1, true,
|
||||
u0Priority, recordFactory)));
|
||||
}
|
||||
// Now force everything to be over user limit
|
||||
qb.setUserLimitFactor((float)0.0);
|
||||
|
||||
// Quiet the loggers while measuring throughput
|
||||
for (Enumeration<?> loggers=LogManager.getCurrentLoggers();
|
||||
loggers.hasMoreElements(); ) {
|
||||
Logger logger = (Logger) loggers.nextElement();
|
||||
logger.setLevel(Level.WARN);
|
||||
}
|
||||
final int topn = 20;
|
||||
final int iterations = 2000000;
|
||||
final int printInterval = 20000;
|
||||
final float numerator = 1000.0f * printInterval;
|
||||
PriorityQueue<Long> queue = new PriorityQueue<>(topn,
|
||||
Collections.reverseOrder());
|
||||
|
||||
long n = Time.monotonicNow();
|
||||
long timespent = 0;
|
||||
for (int i = 0; i < iterations; i+=2) {
|
||||
if (i > 0 && i % printInterval == 0){
|
||||
long ts = (Time.monotonicNow() - n);
|
||||
if (queue.size() < topn) {
|
||||
queue.offer(ts);
|
||||
} else {
|
||||
Long last = queue.peek();
|
||||
if (last > ts) {
|
||||
queue.poll();
|
||||
queue.offer(ts);
|
||||
}
|
||||
}
|
||||
System.out.println(i + " " + (numerator / ts));
|
||||
n= Time.monotonicNow();
|
||||
}
|
||||
cs.handle(new NodeUpdateSchedulerEvent(node));
|
||||
cs.handle(new NodeUpdateSchedulerEvent(node2));
|
||||
}
|
||||
timespent=0;
|
||||
int entries = queue.size();
|
||||
while(queue.size() > 0){
|
||||
long l = queue.poll();
|
||||
timespent += l;
|
||||
}
|
||||
System.out.println("Avg of fastest " + entries + ": "
|
||||
+ numerator / (timespent / entries));
|
||||
rm.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCSQueueBlocked() throws Exception {
|
||||
CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
|
||||
|
@ -1146,7 +1146,7 @@ public void testComputeUserLimitAndSetHeadroom() throws IOException {
|
||||
new ResourceLimits(clusterResource),
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), qb, nodes, apps);
|
||||
qb.computeUserLimitAndSetHeadroom(app_0, clusterResource,
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
|
||||
//maxqueue 16G, userlimit 13G, - 4G used = 9G
|
||||
assertEquals(9*GB,app_0.getHeadroom().getMemorySize());
|
||||
@ -1169,7 +1169,7 @@ public void testComputeUserLimitAndSetHeadroom() throws IOException {
|
||||
new ResourceLimits(clusterResource),
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), qb, nodes, apps);
|
||||
qb.computeUserLimitAndSetHeadroom(app_0, clusterResource,
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
|
||||
assertEquals(8*GB, qb.getUsedResources().getMemorySize());
|
||||
assertEquals(4*GB, app_0.getCurrentConsumption().getMemorySize());
|
||||
@ -1219,7 +1219,7 @@ public void testComputeUserLimitAndSetHeadroom() throws IOException {
|
||||
new ResourceLimits(clusterResource),
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), qb, nodes, apps);
|
||||
qb.computeUserLimitAndSetHeadroom(app_3, clusterResource,
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
assertEquals(4*GB, qb.getUsedResources().getMemorySize());
|
||||
//maxqueue 16G, userlimit 7G, used (by each user) 2G, headroom 5G (both)
|
||||
assertEquals(5*GB, app_3.getHeadroom().getMemorySize());
|
||||
@ -1240,9 +1240,9 @@ public void testComputeUserLimitAndSetHeadroom() throws IOException {
|
||||
new ResourceLimits(clusterResource),
|
||||
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), qb, nodes, apps);
|
||||
qb.computeUserLimitAndSetHeadroom(app_4, clusterResource,
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
qb.computeUserLimitAndSetHeadroom(app_3, clusterResource,
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
|
||||
"", SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null);
|
||||
|
||||
|
||||
//app3 is user1, active from last test case
|
||||
|
Loading…
Reference in New Issue
Block a user