YARN-2352. FairScheduler: Collect metrics on duration of critical methods that affect performance. (kasha)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1616769 13f79535-47bb-0310-9956-ffa450edef68
2014-08-08 14:17:54 +00:00 · 2014-08-08 14:17:54 +00:00 · 14864e9c7c
commit 14864e9c7c
parent d3a2fe2807
6 changed files with 64 additions and 8 deletions
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java
@ -21,14 +21,18 @@
 import java.util.Iterator;
 import java.util.List;

+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Lists;

+import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.metrics2.MetricsInfo;
 import org.apache.hadoop.metrics2.MetricsCollector;
 import org.apache.hadoop.metrics2.MetricsFilter;
 import static org.apache.hadoop.metrics2.lib.Interns.*;

-class MetricsCollectorImpl implements MetricsCollector,
+@InterfaceAudience.Private
+@VisibleForTesting
+public class MetricsCollectorImpl implements MetricsCollector,
    Iterable<MetricsRecordBuilderImpl> {

  private final List<MetricsRecordBuilderImpl> rbs = Lists.newArrayList();
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java
@ -89,6 +89,14 @@ public MutableStat(String name, String description,
    this(name, description, sampleName, valueName, false);
  }

+  /**
+   * Set whether to display the extended stats (stdev, min/max, etc.) or not.
+   * @param extended enable/disable displaying extended stats
+   */
+  public synchronized void setExtended(boolean extended) {
+    this.extended = extended;
+  }
+
  /**
   * Add a number of samples and their sum to the running stat
   * @param numSamples  number of samples
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@ -94,6 +94,9 @@ Release 2.6.0 - UNRELEASED
    YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du
    via zjshen)

+    YARN-2352. FairScheduler: Collect metrics on duration of critical methods that 
+    affect performance. (kasha)
+
  OPTIMIZATIONS

  BUG FIXES
--- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml
+++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml
@ -200,6 +200,13 @@
    <Field name="updateInterval" />
    <Bug pattern="IS2_INCONSISTENT_SYNC" />
  </Match>
+  <!-- Inconsistent sync warning - fsOpDurations is only initialized once and never changed -->
+  <Match>
+    <Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler" />
+    <Field name="fsOpDurations" />
+    <Bug pattern="IS2_INCONSISTENT_SYNC" />
+  </Match>
+
  <!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
  <Match>
    <Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java
@ -149,6 +149,7 @@ public class FairScheduler extends

  // Aggregate metrics
  FSQueueMetrics rootMetrics;
+  FSOpDurations fsOpDurations;

  // Time when we last updated preemption vars
  protected long lastPreemptionUpdateTime;
@ -256,8 +257,11 @@ public void run() {
      while (!Thread.currentThread().isInterrupted()) {
        try {
          Thread.sleep(updateInterval);
+          long start = getClock().getTime();
          update();
          preemptTasksIfNecessary();
+          long duration = getClock().getTime() - start;
+          fsOpDurations.addUpdateThreadRunDuration(duration);
        } catch (InterruptedException ie) {
          LOG.warn("Update thread interrupted. Exiting.");
          return;
@ -294,6 +298,7 @@ public void run() {
   * required resources per job.
   */
  protected synchronized void update() {
+    long start = getClock().getTime();
    updatePreemptionVariables(); // Determine if any queues merit preemption

    FSQueue rootQueue = queueMgr.getRootQueue();
@ -317,6 +322,9 @@ protected synchronized void update() {
            "  Demand: " + rootQueue.getDemand());
      }
    }
+
+    long duration = getClock().getTime() - start;
+    fsOpDurations.addUpdateCallDuration(duration);
  }

  /**
@ -325,7 +333,7 @@ protected synchronized void update() {
   * for each type of task.
   */
  private void updatePreemptionVariables() {
-    long now = clock.getTime();
+    long now = getClock().getTime();
    lastPreemptionUpdateTime = now;
    for (FSLeafQueue sched : queueMgr.getLeafQueues()) {
      if (!isStarvedForMinShare(sched)) {
@ -352,7 +360,8 @@ boolean isStarvedForMinShare(FSLeafQueue sched) {
   * defined as being below half its fair share.
   */
  boolean isStarvedForFairShare(FSLeafQueue sched) {
-    Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR, clusterResource,
+    Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR,
+        clusterResource,
        Resources.multiply(sched.getFairShare(), .5), sched.getDemand());
    return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource,
        sched.getResourceUsage(), desiredFairShare);
@ -370,7 +379,7 @@ protected synchronized void preemptTasksIfNecessary() {
      return;
    }

-    long curTime = clock.getTime();
+    long curTime = getClock().getTime();
    if (curTime - lastPreemptCheckTime < preemptionInterval) {
      return;
    }
@ -398,6 +407,7 @@ protected synchronized void preemptTasksIfNecessary() {
   * We make sure that no queue is placed below its fair share in the process.
   */
  protected void preemptResources(Resource toPreempt) {
+    long start = getClock().getTime();
    if (Resources.equals(toPreempt, Resources.none())) {
      return;
    }
@ -448,6 +458,9 @@ protected void preemptResources(Resource toPreempt) {
        }
      }
    }
+
+    long duration = getClock().getTime() - start;
+    fsOpDurations.addPreemptCallDuration(duration);
  }
  
  protected void warnOrKillContainer(RMContainer container) {
@ -463,7 +476,7 @@ protected void warnOrKillContainer(RMContainer container) {
    if (time != null) {
      // if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
      // proceed with kill
-      if (time + waitTimeBeforeKill < clock.getTime()) {
+      if (time + waitTimeBeforeKill < getClock().getTime()) {
        ContainerStatus status =
          SchedulerUtils.createPreemptedContainerStatus(
            container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);
@ -474,11 +487,11 @@ protected void warnOrKillContainer(RMContainer container) {
        completedContainer(container, status, RMContainerEventType.KILL);
        LOG.info("Killing container" + container +
            " (after waiting for premption for " +
-            (clock.getTime() - time) + "ms)");
+            (getClock().getTime() - time) + "ms)");
      }
    } else {
      // track the request in the FSSchedulerApp itself
-      app.addPreemption(container, clock.getTime());
+      app.addPreemption(container, getClock().getTime());
    }
  }

@ -659,7 +672,7 @@ queue, new ActiveUsersManager(getRootQueueMetrics()),
            rmContext);
    if (transferStateFromPreviousAttempt) {
      attempt.transferStateFromPreviousAttempt(application
-        .getCurrentAppAttempt());
+          .getCurrentAppAttempt());
    }
    application.setCurrentAppAttempt(attempt);

@ -960,6 +973,7 @@ public Allocation allocate(ApplicationAttemptId appAttemptId,
   * Process a heartbeat update from a node.
   */
  private synchronized void nodeUpdate(RMNode nm) {
+    long start = getClock().getTime();
    if (LOG.isDebugEnabled()) {
      LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
    }
@ -996,9 +1010,13 @@ private synchronized void nodeUpdate(RMNode nm) {
    } else {
      attemptScheduling(node);
    }
+
+    long duration = getClock().getTime() - start;
+    fsOpDurations.addNodeUpdateDuration(duration);
  }

  void continuousSchedulingAttempt() throws InterruptedException {
+    long start = getClock().getTime();
    List<NodeId> nodeIdList = new ArrayList<NodeId>(nodes.keySet());
    // Sort the nodes by space available on them, so that we offer
    // containers on emptier nodes first, facilitating an even spread. This
@ -1021,6 +1039,9 @@ void continuousSchedulingAttempt() throws InterruptedException {
            ": " + ex.toString(), ex);
      }
    }
+
+    long duration = getClock().getTime() - start;
+    fsOpDurations.addContinuousSchedulingRunDuration(duration);
  }

  /** Sort nodes by available resource */
@ -1244,6 +1265,8 @@ private synchronized void initScheduler(Configuration conf)
    }

    rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
+    fsOpDurations = FSOpDurations.getInstance(true);
+
    // This stores per-application scheduling information
    this.applications =
        new ConcurrentHashMap<ApplicationId,SchedulerApplication<FSSchedulerApp>>();
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
@ -18,6 +18,7 @@

 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;

+import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotEquals;
@ -3366,4 +3367,14 @@ public void testThreadLifeCycle() throws InterruptedException {

    assertNotEquals("One of the threads is still alive", 0, numRetries);
  }
+
+  @Test
+  public void testPerfMetricsInited() {
+    scheduler.init(conf);
+    scheduler.start();
+    MetricsCollectorImpl collector = new MetricsCollectorImpl();
+    scheduler.fsOpDurations.getMetrics(collector, true);
+    assertEquals("Incorrect number of perf metrics", 1,
+        collector.getRecords().size());
+  }
 }