YARN-2022 Preempting an Application Master container can be kept as least priority when multiple applications are marked for preemption by ProportionalCapacityPreemptionPolicy (Sunil G via mayank)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1607227 13f79535-47bb-0310-9956-ffa450edef68
Mayank Bansal 2014-07-02 01:54:47 +00:00
parent 075ff276ca
commit 03a25d2cc1
9 changed files with 282 additions and 15 deletions
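The heart of the change: when ProportionalCapacityPreemptionPolicy picks victims, ApplicationMaster containers are now deferred to a skipped list, and the policy returns to them only if the queue still owes resources after all ordinary containers have been considered, always retaining up to maxAMCapacityForThisQueue of AM resources. Below is a standalone sketch of that two-pass selection; the Container type and GB-granularity arithmetic are simplifications for illustration, not the YARN classes.

    import java.util.ArrayList;
    import java.util.List;

    public class AmLastSelectionSketch {

      static class Container {
        final String id;
        final int gb;
        final boolean isAM;
        Container(String id, int gb, boolean isAM) {
          this.id = id;
          this.gb = gb;
          this.isAM = isAM;
        }
      }

      /** Two-pass victim selection: ordinary containers first, AMs last. */
      static List<Container> select(List<Container> candidates, int toObtainGb,
          int maxAmRetainedGb) {
        List<Container> selected = new ArrayList<Container>();
        List<Container> skippedAMs = new ArrayList<Container>();
        int skippedAmGb = 0;
        // Pass 1: take non-AM containers; AM containers are only set aside.
        for (Container c : candidates) {
          if (toObtainGb <= 0) {
            break;
          }
          if (c.isAM) {
            skippedAMs.add(c);
            skippedAmGb += c.gb;
            continue;
          }
          selected.add(c);
          toObtainGb -= c.gb;
        }
        // Pass 2: revisit skipped AMs only while still short, and only while
        // the untouched AM resources stay above the per-queue retention cap.
        for (Container am : skippedAMs) {
          if (toObtainGb <= 0 || skippedAmGb <= maxAmRetainedGb) {
            break;
          }
          selected.add(am);
          toObtainGb -= am.gb;
          skippedAmGb -= am.gb;
        }
        return selected;
      }

      public static void main(String[] args) {
        List<Container> cs = new ArrayList<Container>();
        cs.add(new Container("am_1", 1, true));
        cs.add(new Container("container_2", 1, false));
        cs.add(new Container("container_3", 1, false));
        // Needs 3 GB, may retain 0 GB of AM resources: the AM still goes last.
        for (Container c : select(cs, 3, 0)) {
          System.out.println(c.id); // container_2, container_3, am_1
        }
      }
    }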

View File: hadoop-yarn-project/CHANGES.txt

@@ -56,6 +56,10 @@ Release 2.5.0 - UNRELEASED
   IMPROVEMENTS

+    YARN-2022. Preempting an Application Master container can be kept as least
+    priority when multiple applications are marked for preemption by
+    ProportionalCapacityPreemptionPolicy (Sunil G via mayank)
+
     YARN-1479. Invalid NaN values in Hadoop REST API JSON response (Chen He via
     jeagles)

View File: ProportionalCapacityPreemptionPolicy.java

@@ -437,8 +437,9 @@ private void resetCapacity(ResourceCalculator rc, Resource clusterResource,
   private Map<ApplicationAttemptId,Set<RMContainer>> getContainersToPreempt(
       List<TempQueue> queues, Resource clusterResource) {

-    Map<ApplicationAttemptId,Set<RMContainer>> list =
+    Map<ApplicationAttemptId,Set<RMContainer>> preemptMap =
         new HashMap<ApplicationAttemptId,Set<RMContainer>>();
+    List<RMContainer> skippedAMContainerlist = new ArrayList<RMContainer>();

     for (TempQueue qT : queues) {
       // we act only if we are violating balance by more than
@@ -449,6 +450,7 @@ private Map<ApplicationAttemptId,Set<RMContainer>> getContainersToPreempt(
       // accounts for natural termination of containers
       Resource resToObtain =
         Resources.multiply(qT.toBePreempted, naturalTerminationFactor);
+      Resource skippedAMSize = Resource.newInstance(0, 0);

       // lock the leafqueue while we scan applications and unreserve
       synchronized (qT.leafQueue) {
@@ -458,17 +460,73 @@ private Map<ApplicationAttemptId,Set<RMContainer>> getContainersToPreempt(
         qT.actuallyPreempted = Resources.clone(resToObtain);
         while (desc.hasNext()) {
           FiCaSchedulerApp fc = desc.next();
-          if (Resources.lessThanOrEqual(rc, clusterResource,
-              resToObtain, Resources.none())) {
+          if (Resources.lessThanOrEqual(rc, clusterResource, resToObtain,
+              Resources.none())) {
             break;
           }
-          list.put(fc.getApplicationAttemptId(),
-              preemptFrom(fc, clusterResource, resToObtain));
+          preemptMap.put(
+              fc.getApplicationAttemptId(),
+              preemptFrom(fc, clusterResource, resToObtain,
+                  skippedAMContainerlist, skippedAMSize));
         }
+        Resource maxAMCapacityForThisQueue = Resources.multiply(
+            Resources.multiply(clusterResource,
+                qT.leafQueue.getAbsoluteCapacity()),
+            qT.leafQueue.getMaxAMResourcePerQueuePercent());
+
+        // Can try preempting AM containers (still saving at most
+        // maxAMCapacityForThisQueue of AM resources) if more resources are
+        // required to be preempted from this queue.
+        preemptAMContainers(clusterResource, preemptMap,
+            skippedAMContainerlist, resToObtain, skippedAMSize,
+            maxAMCapacityForThisQueue);
       }
     }
-    return list;
+    return preemptMap;
   }

+  /**
+   * As more resources are needed for preemption, saved AM containers have to
+   * be rescanned. Such AM containers can be preempted based on resToObtain,
+   * but maxAMCapacityForThisQueue resources will still be retained.
+   *
+   * @param clusterResource
+   * @param preemptMap
+   * @param skippedAMContainerlist
+   * @param resToObtain
+   * @param skippedAMSize
+   * @param maxAMCapacityForThisQueue
+   */
+  private void preemptAMContainers(Resource clusterResource,
+      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
+      List<RMContainer> skippedAMContainerlist, Resource resToObtain,
+      Resource skippedAMSize, Resource maxAMCapacityForThisQueue) {
+    for (RMContainer c : skippedAMContainerlist) {
+      // Got the required amount of resources for preemption; can stop now.
+      if (Resources.lessThanOrEqual(rc, clusterResource, resToObtain,
+          Resources.none())) {
+        break;
+      }
+      // Once skippedAMSize is down to maxAMCapacityForThisQueue, stop
+      // selecting AM containers for preemption.
+      if (Resources.lessThanOrEqual(rc, clusterResource, skippedAMSize,
+          maxAMCapacityForThisQueue)) {
+        break;
+      }
+      Set<RMContainer> contToPrempt = preemptMap.get(c
+          .getApplicationAttemptId());
+      if (null == contToPrempt) {
+        contToPrempt = new HashSet<RMContainer>();
+        preemptMap.put(c.getApplicationAttemptId(), contToPrempt);
+      }
+      contToPrempt.add(c);
+
+      Resources.subtractFrom(resToObtain, c.getContainer().getResource());
+      Resources.subtractFrom(skippedAMSize, c.getContainer()
+          .getResource());
+    }
+    skippedAMContainerlist.clear();
+  }
+
   /**
@@ -480,8 +538,9 @@ private Map<ApplicationAttemptId,Set<RMContainer>> getContainersToPreempt(
    * @param rsrcPreempt
    * @return
    */
-  private Set<RMContainer> preemptFrom(
-      FiCaSchedulerApp app, Resource clusterResource, Resource rsrcPreempt) {
+  private Set<RMContainer> preemptFrom(FiCaSchedulerApp app,
+      Resource clusterResource, Resource rsrcPreempt,
+      List<RMContainer> skippedAMContainerlist, Resource skippedAMSize) {
     Set<RMContainer> ret = new HashSet<RMContainer>();
     ApplicationAttemptId appId = app.getApplicationAttemptId();
@@ -513,6 +572,12 @@ private Set<RMContainer> preemptFrom(
           rsrcPreempt, Resources.none())) {
         return ret;
       }
+      // Skip AM containers from preemption for now.
+      if (c.isAMContainer()) {
+        skippedAMContainerlist.add(c);
+        Resources.addTo(skippedAMSize, c.getContainer().getResource());
+        continue;
+      }
       ret.add(c);
       Resources.subtractFrom(rsrcPreempt, c.getContainer().getResource());
     }
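For scale, the retained-AM headroom computed above works out as follows (illustrative numbers only, not from the patch):

    public class MaxAmHeadroom {
      public static void main(String[] args) {
        int clusterGb = 100;            // total cluster resource
        double absoluteCapacity = 0.10; // the leaf queue's absolute capacity
        double amPercent = 0.1;         // maxAMResourcePerQueuePercent
        // Mirrors Resources.multiply(Resources.multiply(cluster, absCap), pct):
        double maxAmCapacityGb = clusterGb * absoluteCapacity * amPercent;
        System.out.println(maxAmCapacityGb + " GB of AM containers retained");
      }
    }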

View File: RMAppAttemptImpl.java

@@ -84,6 +84,7 @@
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUpdateSavedEvent;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent;
@@ -833,6 +834,9 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
       // Set the masterContainer
       appAttempt.setMasterContainer(amContainerAllocation.getContainers()
           .get(0));
+      RMContainerImpl rmMasterContainer = (RMContainerImpl)appAttempt.scheduler
+          .getRMContainer(appAttempt.getMasterContainer().getId());
+      rmMasterContainer.setAMContainer(true);
       // The node set in NMTokenSecretManager is used for marking whether the
       // NMToken has been issued for this node to the AM.
       // When AM container was allocated to RM itself, the node which allocates

View File: RMContainer.java

@@ -72,4 +72,6 @@ public interface RMContainer extends EventHandler<RMContainerEvent> {

   ContainerReport createContainerReport();
+
+  boolean isAMContainer();
 }

View File: RMContainerImpl.java

@@ -155,6 +155,7 @@ RMContainerEventType.RELEASED, new KillTransition())
   private long creationTime;
   private long finishTime;
   private ContainerStatus finishedStatus;
+  private boolean isAMContainer;

   public RMContainerImpl(Container container,
       ApplicationAttemptId appAttemptId, NodeId nodeId, String user,
@@ -176,6 +177,7 @@ public RMContainerImpl(Container container,
     this.rmContext = rmContext;
     this.eventHandler = rmContext.getDispatcher().getEventHandler();
     this.containerAllocationExpirer = rmContext.getContainerAllocationExpirer();
+    this.isAMContainer = false;

     ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
     this.readLock = lock.readLock();
@@ -313,6 +315,25 @@ public String toString() {
     return containerId.toString();
   }

+  @Override
+  public boolean isAMContainer() {
+    try {
+      readLock.lock();
+      return isAMContainer;
+    } finally {
+      readLock.unlock();
+    }
+  }
+
+  public void setAMContainer(boolean isAMContainer) {
+    try {
+      writeLock.lock();
+      this.isAMContainer = isAMContainer;
+    } finally {
+      writeLock.unlock();
+    }
+  }
+
   @Override
   public void handle(RMContainerEvent event) {
     LOG.debug("Processing " + event.getContainerId() + " of type " + event.getType());
@@ -490,5 +511,4 @@ public ContainerReport createContainerReport() {
     }
     return containerReport;
   }
-
 }

View File: AbstractYarnScheduler.java

@@ -39,6 +39,7 @@
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent;
@@ -242,6 +243,20 @@ public synchronized void recoverContainersOnNode(
       // recover scheduler attempt
       schedulerAttempt.recoverContainer(rmContainer);
+
+      // Set the master container for the currently running AM container of
+      // this attempt.
+      RMAppAttempt appAttempt = rmApp.getCurrentAppAttempt();
+      if (appAttempt != null) {
+        Container masterContainer = appAttempt.getMasterContainer();
+
+        // Mark the currently running AM container's RMContainer based on
+        // the master container ID stored in the attempt.
+        if (masterContainer != null
+            && masterContainer.getId().equals(rmContainer.getContainerId())) {
+          ((RMContainerImpl)rmContainer).setAMContainer(true);
+        }
+      }
     }
   }

View File: TestWorkPreservingRMRestart.java

@@ -62,6 +62,7 @@
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -565,6 +566,43 @@ public void testAppReregisterOnRMWorkPreservingRestart() throws Exception {
     rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
   }

+  @Test (timeout = 30000)
+  public void testAMContainerStatusWithRMRestart() throws Exception {
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+    rm1 = new MockRM(conf, memStore);
+    rm1.start();
+    MockNM nm1 =
+        new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
+    nm1.registerNode();
+
+    RMApp app1_1 = rm1.submitApp(1024);
+    MockAM am1_1 = MockRM.launchAndRegisterAM(app1_1, rm1, nm1);
+
+    RMAppAttempt attempt0 = app1_1.getCurrentAppAttempt();
+    AbstractYarnScheduler scheduler =
+        ((AbstractYarnScheduler) rm1.getResourceScheduler());
+
+    Assert.assertTrue(scheduler.getRMContainer(
+        attempt0.getMasterContainer().getId()).isAMContainer());
+
+    // Re-start RM
+    rm2 = new MockRM(conf, memStore);
+    rm2.start();
+    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
+
+    List<NMContainerStatus> am1_1Containers =
+        createNMContainerStatusForApp(am1_1);
+    nm1.registerNode(am1_1Containers, null);
+
+    // Wait for the RM to settle down on recovering containers.
+    waitForNumContainersToRecover(2, rm2, am1_1.getApplicationAttemptId());
+
+    scheduler = ((AbstractYarnScheduler) rm2.getResourceScheduler());
+    Assert.assertTrue(scheduler.getRMContainer(
+        attempt0.getMasterContainer().getId()).isAMContainer());
+  }
+
   private void asserteMetrics(QueueMetrics qm, int appsSubmitted,
       int appsPending, int appsRunning, int appsCompleted,
       int allocatedContainers, int availableMB, int availableVirtualCores,

View File: TestProportionalCapacityPreemptionPolicy.java

@@ -80,6 +80,8 @@ public class TestProportionalCapacityPreemptionPolicy {
   static final long TS = 3141592653L;

   int appAlloc = 0;
+  boolean setAMContainer = false;
+  float setAMResourcePercent = 0.0f;
   Random rand = null;
   Clock mClock = null;
   Configuration conf = null;
@@ -467,6 +469,107 @@ public void testPolicyInitializeAfterSchedulerInitialized() {
     fail("Failed to find SchedulingMonitor service, please check what happened");
   }
+  @Test
+  public void testSkipAMContainer() {
+    int[][] qData = new int[][] {
+        //  /    A    B
+        { 100,  50,  50 }, // abs
+        { 100, 100, 100 }, // maxcap
+        { 100, 100,   0 }, // used
+        {  70,  20,  50 }, // pending
+        {   0,   0,   0 }, // reserved
+        {   5,   4,   1 }, // apps
+        {  -1,   1,   1 }, // req granularity
+        {   2,   0,   0 }, // subqueues
+    };
+    setAMContainer = true;
+    ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
+    policy.editSchedule();
+
+    // By skipping its AM container, all other 24 containers of appD will be
+    // preempted.
+    verify(mDisp, times(24)).handle(argThat(new IsPreemptionRequestFor(appD)));
+
+    // By skipping its AM container, all other 24 containers of appC will be
+    // preempted.
+    verify(mDisp, times(24)).handle(argThat(new IsPreemptionRequestFor(appC)));
+
+    // Since the AM containers of appC and appD are saved, 2 containers from
+    // appB have to be preempted.
+    verify(mDisp, times(2)).handle(argThat(new IsPreemptionRequestFor(appB)));
+    setAMContainer = false;
+  }
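A back-of-the-envelope check of those counts (a throwaway class, assuming the test setup leaves the full 50 GB imbalance to preempt):

    public class SkipAmCheck {
      public static void main(String[] args) {
        int toObtain = 50;    // queue A holds 100 GB but is guaranteed only 50
        int perApp = 100 / 4; // 4 apps in A, 25 one-GB containers each
        int fromAppD = perApp - 1;                     // AM skipped -> 24
        int fromAppC = perApp - 1;                     // AM skipped -> 24
        int fromAppB = toObtain - fromAppD - fromAppC; // remainder -> 2
        System.out.println(fromAppD + ", " + fromAppC + ", " + fromAppB);
      }
    }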
+  @Test
+  public void testPreemptSkippedAMContainers() {
+    int[][] qData = new int[][] {
+        //  /    A    B
+        { 100,  10,  90 }, // abs
+        { 100, 100, 100 }, // maxcap
+        { 100, 100,   0 }, // used
+        {  70,  20,  90 }, // pending
+        {   0,   0,   0 }, // reserved
+        {   5,   4,   1 }, // apps
+        {  -1,   5,   5 }, // req granularity
+        {   2,   0,   0 }, // subqueues
+    };
+    setAMContainer = true;
+    ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
+    policy.editSchedule();
+
+    // All 5 containers of appD will be preempted, including its AM container.
+    verify(mDisp, times(5)).handle(argThat(new IsPreemptionRequestFor(appD)));
+
+    // All 5 containers of appC will be preempted, including its AM container.
+    verify(mDisp, times(5)).handle(argThat(new IsPreemptionRequestFor(appC)));
+
+    // By skipping its AM container, all other 4 containers of appB will be
+    // preempted.
+    verify(mDisp, times(4)).handle(argThat(new IsPreemptionRequestFor(appB)));
+
+    // By skipping its AM container, all other 4 containers of appA will be
+    // preempted.
+    verify(mDisp, times(4)).handle(argThat(new IsPreemptionRequestFor(appA)));
+    setAMContainer = false;
+  }
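The same arithmetic for this case (again a throwaway check; it assumes the mocked queue reports a maxAMResourcePerQueuePercent of 0, so no AM headroom is retained and skipped AMs are revisited in order):

    public class PreemptSkippedAmCheck {
      public static void main(String[] args) {
        int toObtain = 90;                   // B's demand forces A down to 10 GB
        int nonAmGb = 4 * 4 * 5;             // pass 1: 4 apps x 4 non-AM 5GB containers
        int stillShort = toObtain - nonAmGb; // 10 GB still owed after pass 1
        int amsPreempted = stillShort / 5;   // pass 2: appD's and appC's 5GB AMs
        System.out.println(nonAmGb + ", " + stillShort + ", " + amsPreempted);
      }
    }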
+  @Test
+  public void testAMResourcePercentForSkippedAMContainers() {
+    int[][] qData = new int[][] {
+        //  /    A    B
+        { 100,  10,  90 }, // abs
+        { 100, 100, 100 }, // maxcap
+        { 100, 100,   0 }, // used
+        {  70,  20,  90 }, // pending
+        {   0,   0,   0 }, // reserved
+        {   5,   4,   1 }, // apps
+        {  -1,   5,   5 }, // req granularity
+        {   2,   0,   0 }, // subqueues
+    };
+    setAMContainer = true;
+    setAMResourcePercent = 0.5f;
+    ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
+    policy.editSchedule();
+
+    // AMResourcePercent is 50% of queue A's 10GB capacity, so maxAMCapacity
+    // is 5GB. The skipped AM containers total 20GB and more resources are
+    // still needed, hence 2 AM containers have to be preempted: all 5
+    // containers of appD, including its AM, are preempted.
+    verify(mDisp, times(5)).handle(argThat(new IsPreemptionRequestFor(appD)));
+
+    // Including its AM container, all 5 containers of appC will be preempted.
+    verify(mDisp, times(5)).handle(argThat(new IsPreemptionRequestFor(appC)));
+
+    // By skipping its AM container, all other 4 containers of appB will be
+    // preempted.
+    verify(mDisp, times(4)).handle(argThat(new IsPreemptionRequestFor(appB)));
+
+    // By skipping its AM container, all other 4 containers of appA will be
+    // preempted.
+    verify(mDisp, times(4)).handle(argThat(new IsPreemptionRequestFor(appA)));
+    setAMContainer = false;
+  }
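For the 50% case, the retention cap can be traced directly (throwaway check; maxAmCap here mirrors clusterResource x absoluteCapacity x setAMResourcePercent from the policy):

    public class AmPercentCheck {
      public static void main(String[] args) {
        double maxAmCap = 100 * 0.10 * 0.5; // 5 GB of AM resources may be kept
        int skippedAmGb = 4 * 5;            // four skipped 5GB AMs = 20 GB
        int stillShortGb = 90 - 80;         // owed after the non-AM pass
        int amsPreempted = 0;
        while (stillShortGb > 0 && skippedAmGb > maxAmCap) {
          amsPreempted++;                   // appD's AM first, then appC's
          stillShortGb -= 5;
          skippedAmGb -= 5;
        }
        System.out.println(amsPreempted);   // 2
      }
    }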
   static class IsPreemptionRequestFor
       extends ArgumentMatcher<ContainerPreemptEvent> {
     private final ApplicationAttemptId appAttId;
@@ -583,6 +686,9 @@ public int compare(FiCaSchedulerApp a1, FiCaSchedulerApp a2) {
       }
     }
     when(lq.getApplications()).thenReturn(qApps);
+    if (setAMResourcePercent != 0.0f) {
+      when(lq.getMaxAMResourcePerQueuePercent()).thenReturn(setAMResourcePercent);
+    }
     p.getChildQueues().add(lq);
     return lq;
   }
@@ -607,7 +713,11 @@ FiCaSchedulerApp mockApp(int qid, int id, int used, int pending, int reserved,
     List<RMContainer> cLive = new ArrayList<RMContainer>();
     for (int i = 0; i < used; i += gran) {
-      cLive.add(mockContainer(appAttId, cAlloc, unit, 1));
+      if (setAMContainer && i == 0) {
+        cLive.add(mockContainer(appAttId, cAlloc, unit, 0));
+      } else {
+        cLive.add(mockContainer(appAttId, cAlloc, unit, 1));
+      }
       ++cAlloc;
     }
     when(app.getLiveContainers()).thenReturn(cLive);
@@ -623,6 +733,10 @@ RMContainer mockContainer(ApplicationAttemptId appAttId, int id,
     RMContainer mC = mock(RMContainer.class);
     when(mC.getContainerId()).thenReturn(cId);
     when(mC.getContainer()).thenReturn(c);
+    when(mC.getApplicationAttemptId()).thenReturn(appAttId);
+    if (0 == priority) {
+      when(mC.isAMContainer()).thenReturn(true);
+    }
     return mC;
   }

View File: TestRMAppAttemptTransitions.java

@@ -86,6 +86,8 @@
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUpdateSavedEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent;
@@ -600,6 +602,9 @@ private Container allocateApplicationAttempt() {
         any(List.class),
         any(List.class))).
         thenReturn(allocation);
+    RMContainer rmContainer = mock(RMContainerImpl.class);
+    when(scheduler.getRMContainer(container.getId())).
+        thenReturn(rmContainer);
     applicationAttempt.handle(
         new RMAppAttemptContainerAllocatedEvent(