YARN-6042. Dump scheduler and queue state information into FairScheduler DEBUG log. (Yufei Gu via rchiang)

This commit is contained in:
Ray Chiang 2017-03-10 16:13:31 -08:00
parent 229c7c9f89
commit 4db9cc70d0
7 changed files with 206 additions and 34 deletions

View File

@ -321,3 +321,12 @@ log4j.appender.EWMA=org.apache.hadoop.yarn.util.Log4jWarningErrorMetricsAppender
log4j.appender.EWMA.cleanupInterval=${yarn.ewma.cleanupInterval} log4j.appender.EWMA.cleanupInterval=${yarn.ewma.cleanupInterval}
log4j.appender.EWMA.messageAgeLimitSeconds=${yarn.ewma.messageAgeLimitSeconds} log4j.appender.EWMA.messageAgeLimitSeconds=${yarn.ewma.messageAgeLimitSeconds}
log4j.appender.EWMA.maxUniqueMessages=${yarn.ewma.maxUniqueMessages} log4j.appender.EWMA.maxUniqueMessages=${yarn.ewma.maxUniqueMessages}
# Fair scheduler requests log on state dump
log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.statedump=DEBUG,FSLOGGER
log4j.appender.FSLOGGER=org.apache.log4j.RollingFileAppender
log4j.appender.FSLOGGER.File=${hadoop.log.dir}/fairscheduler-statedump.log
log4j.appender.FSLOGGER.layout=org.apache.log4j.PatternLayout
log4j.appender.FSLOGGER.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.FSLOGGER.MaxFileSize=${hadoop.log.maxfilesize}
log4j.appender.FSLOGGER.MaxBackupIndex=${hadoop.log.maxbackupindex}

View File

@ -835,25 +835,27 @@ private Resource assignContainer(
return capability; return capability;
} }
if (LOG.isDebugEnabled()) {
LOG.debug("Resource request: " + capability + " exceeds the available"
+ " resources of the node.");
}
// The desired container won't fit here, so reserve // The desired container won't fit here, so reserve
if (isReservable(capability) && if (isReservable(capability) &&
reserve(pendingAsk.getPerAllocationResource(), node, reservedContainer, reserve(pendingAsk.getPerAllocationResource(), node, reservedContainer,
type, schedulerKey)) { type, schedulerKey)) {
if (isWaitingForAMContainer()) { updateAMDiagnosticMsg(capability, " exceeds the available resources of "
updateAMDiagnosticMsg(capability, + "the node and the request is reserved)");
" exceed the available resources of the node and the request is" if (LOG.isDebugEnabled()) {
+ " reserved"); LOG.debug(getName() + "'s resource request is reserved.");
} }
return FairScheduler.CONTAINER_RESERVED; return FairScheduler.CONTAINER_RESERVED;
} else { } else {
if (isWaitingForAMContainer()) { updateAMDiagnosticMsg(capability, " exceeds the available resources of "
updateAMDiagnosticMsg(capability, + "the node and the request cannot be reserved)");
" exceed the available resources of the node and the request cannot"
+ " be reserved");
}
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Couldn't creating reservation for " + LOG.debug("Couldn't create reservation for app: " + getName()
getName() + ",at priority " + schedulerKey.getPriority()); + ", at priority " + schedulerKey.getPriority());
} }
return Resources.none(); return Resources.none();
} }
@ -1034,10 +1036,9 @@ private boolean hasContainerForNode(SchedulerRequestKey key,
ret = false; ret = false;
} else if (!getQueue().fitsInMaxShare(resource)) { } else if (!getQueue().fitsInMaxShare(resource)) {
// The requested container must fit in queue maximum share // The requested container must fit in queue maximum share
if (isWaitingForAMContainer()) { updateAMDiagnosticMsg(resource,
updateAMDiagnosticMsg(resource, " exceeds current queue or its parents maximum resource allowed).");
" exceeds current queue or its parents maximum resource allowed).");
}
ret = false; ret = false;
} }
@ -1309,15 +1310,13 @@ public void updateDemand() {
@Override @Override
public Resource assignContainer(FSSchedulerNode node) { public Resource assignContainer(FSSchedulerNode node) {
if (isOverAMShareLimit()) { if (isOverAMShareLimit()) {
if (isWaitingForAMContainer()) { PendingAsk amAsk = appSchedulingInfo.getNextPendingAsk();
PendingAsk amAsk = appSchedulingInfo.getNextPendingAsk(); updateAMDiagnosticMsg(amAsk.getPerAllocationResource(),
updateAMDiagnosticMsg(amAsk.getPerAllocationResource(), " exceeds maximum AM resource allowed).");
" exceeds maximum AM resource allowed).");
}
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Skipping allocation because maxAMShare limit would " + LOG.debug("AM resource request: " + amAsk.getPerAllocationResource()
"be exceeded"); + " exceeds maximum AM resource allowed, "
+ getQueue().dumpState());
} }
return Resources.none(); return Resources.none();
} }
@ -1331,6 +1330,10 @@ public Resource assignContainer(FSSchedulerNode node) {
* @param reason the reason why AM doesn't get the resource * @param reason the reason why AM doesn't get the resource
*/ */
private void updateAMDiagnosticMsg(Resource resource, String reason) { private void updateAMDiagnosticMsg(Resource resource, String reason) {
if (!isWaitingForAMContainer()) {
return;
}
StringBuilder diagnosticMessageBldr = new StringBuilder(); StringBuilder diagnosticMessageBldr = new StringBuilder();
diagnosticMessageBldr.append(" (Resource request: "); diagnosticMessageBldr.append(" (Resource request: ");
diagnosticMessageBldr.append(resource); diagnosticMessageBldr.append(resource);

View File

@ -617,4 +617,25 @@ private boolean isStarvedForFairShare() {
boolean isStarved() { boolean isStarved() {
return isStarvedForMinShare() || isStarvedForFairShare(); return isStarvedForMinShare() || isStarvedForFairShare();
} }
@Override
protected void dumpStateInternal(StringBuilder sb) {
sb.append("{Name: " + getName() +
", Weight: " + weights +
", Policy: " + policy.getName() +
", FairShare: " + getFairShare() +
", SteadyFairShare: " + getSteadyFairShare() +
", MaxShare: " + maxShare +
", MinShare: " + minShare +
", ResourceUsage: " + getResourceUsage() +
", Demand: " + getDemand() +
", Runnable: " + getNumRunnableApps() +
", NumPendingApps: " + getNumPendingApps() +
", NonRunnable: " + getNumNonRunnableApps() +
", MaxAMShare: " + maxAMShare +
", MaxAMResource: " + computeMaxAMResource() +
", AMResourceUsage: " + getAmResourceUsage() +
", LastTimeAtMinShare: " + lastTimeAtMinShare +
"}");
}
} }

View File

@ -292,4 +292,25 @@ public void recoverContainer(Resource clusterResource,
// TODO Auto-generated method stub // TODO Auto-generated method stub
} }
@Override
protected void dumpStateInternal(StringBuilder sb) {
sb.append("{Name: " + getName() +
", Weight: " + weights +
", Policy: " + policy.getName() +
", FairShare: " + getFairShare() +
", SteadyFairShare: " + getSteadyFairShare() +
", MaxShare: " + maxShare +
", MinShare: " + minShare +
", ResourceUsage: " + getResourceUsage() +
", Demand: " + getDemand() +
", MaxAMShare: " + maxAMShare +
", Runnable: " + getNumRunnableApps() +
"}");
for(FSQueue child : getChildQueues()) {
sb.append(", ");
child.dumpStateInternal(sb);
}
}
} }

View File

@ -393,11 +393,22 @@ public abstract void collectSchedulerApplications(
* @return true if check passes (can assign) or false otherwise * @return true if check passes (can assign) or false otherwise
*/ */
boolean assignContainerPreCheck(FSSchedulerNode node) { boolean assignContainerPreCheck(FSSchedulerNode node) {
if (!Resources.fitsIn(getResourceUsage(), maxShare) if (node.getReservedContainer() != null) {
|| node.getReservedContainer() != null) { if (LOG.isDebugEnabled()) {
LOG.debug("Assigning container failed on node '" + node.getNodeName()
+ " because it has reserved containers.");
}
return false; return false;
} else if (!Resources.fitsIn(getResourceUsage(), maxShare)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Assigning container failed on node '" + node.getNodeName()
+ " because queue resource usage is larger than MaxShare: "
+ dumpState());
}
return false;
} else {
return true;
} }
return true;
} }
/** /**
@ -453,6 +464,11 @@ boolean fitsInMaxShare(Resource additionalResource) {
Resources.add(getResourceUsage(), additionalResource); Resources.add(getResourceUsage(), additionalResource);
if (!Resources.fitsIn(usagePlusAddition, getMaxShare())) { if (!Resources.fitsIn(usagePlusAddition, getMaxShare())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Resource usage plus resource request: " + usagePlusAddition
+ " exceeds maximum resource allowed:" + getMaxShare()
+ " in queue " + getName());
}
return false; return false;
} }
@ -491,4 +507,23 @@ public boolean verifyAndSetPolicyFromConf(AllocationConfiguration queueConf) {
setPolicy(queuePolicy); setPolicy(queuePolicy);
return true; return true;
} }
/**
* Recursively dump states of all queues.
*
* @return a string which holds all queue states
*/
public String dumpState() {
StringBuilder sb = new StringBuilder();
dumpStateInternal(sb);
return sb.toString();
}
/**
* Recursively dump states of all queues.
*
* @param sb the {code StringBuilder} which holds queue states
*/
protected abstract void dumpStateInternal(StringBuilder sb);
} }

View File

@ -139,7 +139,9 @@ public class FairScheduler extends
private boolean usePortForNodeName; private boolean usePortForNodeName;
private static final Log LOG = LogFactory.getLog(FairScheduler.class); private static final Log LOG = LogFactory.getLog(FairScheduler.class);
private static final Log STATE_DUMP_LOG =
LogFactory.getLog(FairScheduler.class.getName() + ".statedump");
private static final ResourceCalculator RESOURCE_CALCULATOR = private static final ResourceCalculator RESOURCE_CALCULATOR =
new DefaultResourceCalculator(); new DefaultResourceCalculator();
private static final ResourceCalculator DOMINANT_RESOURCE_CALCULATOR = private static final ResourceCalculator DOMINANT_RESOURCE_CALCULATOR =
@ -151,7 +153,7 @@ public class FairScheduler extends
// How often fair shares are re-calculated (ms) // How often fair shares are re-calculated (ms)
protected long updateInterval; protected long updateInterval;
private final int UPDATE_DEBUG_FREQUENCY = 5; private final int UPDATE_DEBUG_FREQUENCY = 25;
private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;
@VisibleForTesting @VisibleForTesting
@ -344,6 +346,21 @@ public void run() {
} }
} }
/**
* Dump scheduler state including states of all queues.
*/
private void dumpSchedulerState() {
FSQueue rootQueue = queueMgr.getRootQueue();
Resource clusterResource = getClusterResource();
LOG.debug("FairScheduler state: Cluster Capacity: " + clusterResource +
" Allocations: " + rootMetrics.getAllocatedResources() +
" Availability: " + Resource.newInstance(
rootMetrics.getAvailableMB(), rootMetrics.getAvailableVirtualCores()) +
" Demand: " + rootQueue.getDemand());
STATE_DUMP_LOG.debug(rootQueue.dumpState());
}
/** /**
* Recompute the internal variables used by the scheduler - per-job weights, * Recompute the internal variables used by the scheduler - per-job weights,
* fair shares, deficits, minimum slot allocations, and amount of used and * fair shares, deficits, minimum slot allocations, and amount of used and
@ -368,12 +385,7 @@ public void update() {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
if (--updatesToSkipForDebug < 0) { if (--updatesToSkipForDebug < 0) {
updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;
LOG.debug("Cluster Capacity: " + clusterResource + dumpSchedulerState();
" Allocations: " + rootMetrics.getAllocatedResources() +
" Availability: " + Resource.newInstance(
rootMetrics.getAvailableMB(),
rootMetrics.getAvailableVirtualCores()) +
" Demand: " + rootQueue.getDemand());
} }
} }
} finally { } finally {

View File

@ -5146,4 +5146,75 @@ public void testUpdateDemand() throws IOException {
Resources.equals(aQueue.getDemand(), maxResource) && Resources.equals(aQueue.getDemand(), maxResource) &&
Resources.equals(bQueue.getDemand(), maxResource)); Resources.equals(bQueue.getDemand(), maxResource));
} }
@Test
public void testDumpState() throws IOException {
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE));
out.println("<?xml version=\"1.0\"?>");
out.println("<allocations>");
out.println("<queue name=\"parent\">");
out.println(" <queue name=\"child1\">");
out.println(" <weight>1</weight>");
out.println(" </queue>");
out.println("</queue>");
out.println("</allocations>");
out.close();
ControlledClock clock = new ControlledClock();
scheduler.setClock(clock);
scheduler.init(conf);
scheduler.start();
scheduler.reinitialize(conf, resourceManager.getRMContext());
FSLeafQueue child1 =
scheduler.getQueueManager().getLeafQueue("parent.child1", false);
Resource resource = Resource.newInstance(4 * GB, 4);
child1.setMaxShare(resource);
FSAppAttempt app = mock(FSAppAttempt.class);
Mockito.when(app.getDemand()).thenReturn(resource);
Mockito.when(app.getResourceUsage()).thenReturn(resource);
child1.addAppSchedulable(app);
child1.updateDemand();
String childQueueString = "{Name: root.parent.child1,"
+ " Weight: <memory weight=1.0, cpu weight=1.0>,"
+ " Policy: fair,"
+ " FairShare: <memory:0, vCores:0>,"
+ " SteadyFairShare: <memory:0, vCores:0>,"
+ " MaxShare: <memory:4096, vCores:4>,"
+ " MinShare: <memory:0, vCores:0>,"
+ " ResourceUsage: <memory:4096, vCores:4>,"
+ " Demand: <memory:4096, vCores:4>,"
+ " Runnable: 1,"
+ " NumPendingApps: 0,"
+ " NonRunnable: 0,"
+ " MaxAMShare: 0.5,"
+ " MaxAMResource: <memory:0, vCores:0>,"
+ " AMResourceUsage: <memory:0, vCores:0>,"
+ " LastTimeAtMinShare: " + clock.getTime()
+ "}";
assertTrue(child1.dumpState().equals(childQueueString));
FSParentQueue parent =
scheduler.getQueueManager().getParentQueue("parent", false);
parent.setMaxShare(resource);
String parentQueueString = "{Name: root.parent,"
+ " Weight: <memory weight=1.0, cpu weight=1.0>,"
+ " Policy: fair,"
+ " FairShare: <memory:0, vCores:0>,"
+ " SteadyFairShare: <memory:0, vCores:0>,"
+ " MaxShare: <memory:4096, vCores:4>,"
+ " MinShare: <memory:0, vCores:0>,"
+ " ResourceUsage: <memory:4096, vCores:4>,"
+ " Demand: <memory:0, vCores:0>,"
+ " MaxAMShare: 0.5,"
+ " Runnable: 0}";
assertTrue(parent.dumpState().equals(
parentQueueString + ", " + childQueueString));
}
} }