From 4db9cc70d0178703fb28f451eb84d97f2bf63af8 Mon Sep 17 00:00:00 2001 From: Ray Chiang Date: Fri, 10 Mar 2017 16:13:31 -0800 Subject: [PATCH] YARN-6042. Dump scheduler and queue state information into FairScheduler DEBUG log. (Yufei Gu via rchiang) --- .../src/main/conf/log4j.properties | 9 +++ .../scheduler/fair/FSAppAttempt.java | 49 +++++++------ .../scheduler/fair/FSLeafQueue.java | 21 ++++++ .../scheduler/fair/FSParentQueue.java | 21 ++++++ .../scheduler/fair/FSQueue.java | 41 ++++++++++- .../scheduler/fair/FairScheduler.java | 28 +++++--- .../scheduler/fair/TestFairScheduler.java | 71 +++++++++++++++++++ 7 files changed, 206 insertions(+), 34 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties b/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties index b8c84e7b30..60267635df 100644 --- a/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties +++ b/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties @@ -321,3 +321,12 @@ log4j.appender.EWMA=org.apache.hadoop.yarn.util.Log4jWarningErrorMetricsAppender log4j.appender.EWMA.cleanupInterval=${yarn.ewma.cleanupInterval} log4j.appender.EWMA.messageAgeLimitSeconds=${yarn.ewma.messageAgeLimitSeconds} log4j.appender.EWMA.maxUniqueMessages=${yarn.ewma.maxUniqueMessages} + +# Fair scheduler requests log on state dump +log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.statedump=DEBUG,FSLOGGER +log4j.appender.FSLOGGER=org.apache.log4j.RollingFileAppender +log4j.appender.FSLOGGER.File=${hadoop.log.dir}/fairscheduler-statedump.log +log4j.appender.FSLOGGER.layout=org.apache.log4j.PatternLayout +log4j.appender.FSLOGGER.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +log4j.appender.FSLOGGER.MaxFileSize=${hadoop.log.maxfilesize} +log4j.appender.FSLOGGER.MaxBackupIndex=${hadoop.log.maxbackupindex} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index 3a9c94e656..ccfcffb83d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -835,25 +835,27 @@ private Resource assignContainer( return capability; } + if (LOG.isDebugEnabled()) { + LOG.debug("Resource request: " + capability + " exceeds the available" + + " resources of the node."); + } + // The desired container won't fit here, so reserve if (isReservable(capability) && reserve(pendingAsk.getPerAllocationResource(), node, reservedContainer, type, schedulerKey)) { - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(capability, - " exceed the available resources of the node and the request is" - + " reserved"); + updateAMDiagnosticMsg(capability, " exceeds the available resources of " + + "the node and the request is reserved)"); + if (LOG.isDebugEnabled()) { + LOG.debug(getName() + "'s resource request is reserved."); } return FairScheduler.CONTAINER_RESERVED; } else { - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(capability, - " exceed the available resources of the node and the request cannot" - + " be reserved"); - } + updateAMDiagnosticMsg(capability, " exceeds the available resources of " + + "the node and the request cannot be reserved)"); if (LOG.isDebugEnabled()) { - LOG.debug("Couldn't creating reservation for " + - getName() + ",at priority " + schedulerKey.getPriority()); + LOG.debug("Couldn't create reservation for app: " + getName() + + ", at priority " + schedulerKey.getPriority()); } return Resources.none(); } @@ -1034,10 +1036,9 @@ private boolean hasContainerForNode(SchedulerRequestKey key, ret = false; } else if (!getQueue().fitsInMaxShare(resource)) { // The requested container must fit in queue maximum share - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(resource, - " exceeds current queue or its parents maximum resource allowed)."); - } + updateAMDiagnosticMsg(resource, + " exceeds current queue or its parents maximum resource allowed)."); + ret = false; } @@ -1309,15 +1310,13 @@ public void updateDemand() { @Override public Resource assignContainer(FSSchedulerNode node) { if (isOverAMShareLimit()) { - if (isWaitingForAMContainer()) { - PendingAsk amAsk = appSchedulingInfo.getNextPendingAsk(); - updateAMDiagnosticMsg(amAsk.getPerAllocationResource(), - " exceeds maximum AM resource allowed)."); - } - + PendingAsk amAsk = appSchedulingInfo.getNextPendingAsk(); + updateAMDiagnosticMsg(amAsk.getPerAllocationResource(), + " exceeds maximum AM resource allowed)."); if (LOG.isDebugEnabled()) { - LOG.debug("Skipping allocation because maxAMShare limit would " + - "be exceeded"); + LOG.debug("AM resource request: " + amAsk.getPerAllocationResource() + + " exceeds maximum AM resource allowed, " + + getQueue().dumpState()); } return Resources.none(); } @@ -1331,6 +1330,10 @@ public Resource assignContainer(FSSchedulerNode node) { * @param reason the reason why AM doesn't get the resource */ private void updateAMDiagnosticMsg(Resource resource, String reason) { + if (!isWaitingForAMContainer()) { + return; + } + StringBuilder diagnosticMessageBldr = new StringBuilder(); diagnosticMessageBldr.append(" (Resource request: "); diagnosticMessageBldr.append(resource); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java index 5be4b361af..10f1e287f4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java @@ -617,4 +617,25 @@ private boolean isStarvedForFairShare() { boolean isStarved() { return isStarvedForMinShare() || isStarvedForFairShare(); } + + @Override + protected void dumpStateInternal(StringBuilder sb) { + sb.append("{Name: " + getName() + + ", Weight: " + weights + + ", Policy: " + policy.getName() + + ", FairShare: " + getFairShare() + + ", SteadyFairShare: " + getSteadyFairShare() + + ", MaxShare: " + maxShare + + ", MinShare: " + minShare + + ", ResourceUsage: " + getResourceUsage() + + ", Demand: " + getDemand() + + ", Runnable: " + getNumRunnableApps() + + ", NumPendingApps: " + getNumPendingApps() + + ", NonRunnable: " + getNumNonRunnableApps() + + ", MaxAMShare: " + maxAMShare + + ", MaxAMResource: " + computeMaxAMResource() + + ", AMResourceUsage: " + getAmResourceUsage() + + ", LastTimeAtMinShare: " + lastTimeAtMinShare + + "}"); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java index 1c8e9ced59..b062c586d0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java @@ -292,4 +292,25 @@ public void recoverContainer(Resource clusterResource, // TODO Auto-generated method stub } + + @Override + protected void dumpStateInternal(StringBuilder sb) { + sb.append("{Name: " + getName() + + ", Weight: " + weights + + ", Policy: " + policy.getName() + + ", FairShare: " + getFairShare() + + ", SteadyFairShare: " + getSteadyFairShare() + + ", MaxShare: " + maxShare + + ", MinShare: " + minShare + + ", ResourceUsage: " + getResourceUsage() + + ", Demand: " + getDemand() + + ", MaxAMShare: " + maxAMShare + + ", Runnable: " + getNumRunnableApps() + + "}"); + + for(FSQueue child : getChildQueues()) { + sb.append(", "); + child.dumpStateInternal(sb); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java index b5592c55bc..acf4d5c203 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java @@ -393,11 +393,22 @@ public abstract void collectSchedulerApplications( * @return true if check passes (can assign) or false otherwise */ boolean assignContainerPreCheck(FSSchedulerNode node) { - if (!Resources.fitsIn(getResourceUsage(), maxShare) - || node.getReservedContainer() != null) { + if (node.getReservedContainer() != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Assigning container failed on node '" + node.getNodeName() + + " because it has reserved containers."); + } return false; + } else if (!Resources.fitsIn(getResourceUsage(), maxShare)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Assigning container failed on node '" + node.getNodeName() + + " because queue resource usage is larger than MaxShare: " + + dumpState()); + } + return false; + } else { + return true; } - return true; } /** @@ -453,6 +464,11 @@ boolean fitsInMaxShare(Resource additionalResource) { Resources.add(getResourceUsage(), additionalResource); if (!Resources.fitsIn(usagePlusAddition, getMaxShare())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Resource usage plus resource request: " + usagePlusAddition + + " exceeds maximum resource allowed:" + getMaxShare() + + " in queue " + getName()); + } return false; } @@ -491,4 +507,23 @@ public boolean verifyAndSetPolicyFromConf(AllocationConfiguration queueConf) { setPolicy(queuePolicy); return true; } + + /** + * Recursively dump states of all queues. + * + * @return a string which holds all queue states + */ + public String dumpState() { + StringBuilder sb = new StringBuilder(); + dumpStateInternal(sb); + return sb.toString(); + } + + + /** + * Recursively dump states of all queues. + * + * @param sb the {code StringBuilder} which holds queue states + */ + protected abstract void dumpStateInternal(StringBuilder sb); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 4f3e4f9140..f3fde76f06 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -139,7 +139,9 @@ public class FairScheduler extends private boolean usePortForNodeName; private static final Log LOG = LogFactory.getLog(FairScheduler.class); - + private static final Log STATE_DUMP_LOG = + LogFactory.getLog(FairScheduler.class.getName() + ".statedump"); + private static final ResourceCalculator RESOURCE_CALCULATOR = new DefaultResourceCalculator(); private static final ResourceCalculator DOMINANT_RESOURCE_CALCULATOR = @@ -151,7 +153,7 @@ public class FairScheduler extends // How often fair shares are re-calculated (ms) protected long updateInterval; - private final int UPDATE_DEBUG_FREQUENCY = 5; + private final int UPDATE_DEBUG_FREQUENCY = 25; private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; @VisibleForTesting @@ -344,6 +346,21 @@ public void run() { } } + /** + * Dump scheduler state including states of all queues. + */ + private void dumpSchedulerState() { + FSQueue rootQueue = queueMgr.getRootQueue(); + Resource clusterResource = getClusterResource(); + LOG.debug("FairScheduler state: Cluster Capacity: " + clusterResource + + " Allocations: " + rootMetrics.getAllocatedResources() + + " Availability: " + Resource.newInstance( + rootMetrics.getAvailableMB(), rootMetrics.getAvailableVirtualCores()) + + " Demand: " + rootQueue.getDemand()); + + STATE_DUMP_LOG.debug(rootQueue.dumpState()); + } + /** * Recompute the internal variables used by the scheduler - per-job weights, * fair shares, deficits, minimum slot allocations, and amount of used and @@ -368,12 +385,7 @@ public void update() { if (LOG.isDebugEnabled()) { if (--updatesToSkipForDebug < 0) { updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; - LOG.debug("Cluster Capacity: " + clusterResource + - " Allocations: " + rootMetrics.getAllocatedResources() + - " Availability: " + Resource.newInstance( - rootMetrics.getAvailableMB(), - rootMetrics.getAvailableVirtualCores()) + - " Demand: " + rootQueue.getDemand()); + dumpSchedulerState(); } } } finally { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index cdf0d81fe2..537d3d0d39 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -5146,4 +5146,75 @@ public void testUpdateDemand() throws IOException { Resources.equals(aQueue.getDemand(), maxResource) && Resources.equals(bQueue.getDemand(), maxResource)); } + + @Test + public void testDumpState() throws IOException { + conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE); + + PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); + out.println(""); + out.println(""); + out.println(""); + out.println(" "); + out.println(" 1"); + out.println(" "); + out.println(""); + out.println(""); + out.close(); + + ControlledClock clock = new ControlledClock(); + scheduler.setClock(clock); + + scheduler.init(conf); + scheduler.start(); + scheduler.reinitialize(conf, resourceManager.getRMContext()); + + FSLeafQueue child1 = + scheduler.getQueueManager().getLeafQueue("parent.child1", false); + Resource resource = Resource.newInstance(4 * GB, 4); + child1.setMaxShare(resource); + FSAppAttempt app = mock(FSAppAttempt.class); + Mockito.when(app.getDemand()).thenReturn(resource); + Mockito.when(app.getResourceUsage()).thenReturn(resource); + child1.addAppSchedulable(app); + child1.updateDemand(); + + String childQueueString = "{Name: root.parent.child1," + + " Weight: ," + + " Policy: fair," + + " FairShare: ," + + " SteadyFairShare: ," + + " MaxShare: ," + + " MinShare: ," + + " ResourceUsage: ," + + " Demand: ," + + " Runnable: 1," + + " NumPendingApps: 0," + + " NonRunnable: 0," + + " MaxAMShare: 0.5," + + " MaxAMResource: ," + + " AMResourceUsage: ," + + " LastTimeAtMinShare: " + clock.getTime() + + "}"; + + assertTrue(child1.dumpState().equals(childQueueString)); + FSParentQueue parent = + scheduler.getQueueManager().getParentQueue("parent", false); + parent.setMaxShare(resource); + + String parentQueueString = "{Name: root.parent," + + " Weight: ," + + " Policy: fair," + + " FairShare: ," + + " SteadyFairShare: ," + + " MaxShare: ," + + " MinShare: ," + + " ResourceUsage: ," + + " Demand: ," + + " MaxAMShare: 0.5," + + " Runnable: 0}"; + + assertTrue(parent.dumpState().equals( + parentQueueString + ", " + childQueueString)); + } }