From 5dadf963d3639cc6d37902d9c7beaacdafac0e9c Mon Sep 17 00:00:00 2001 From: bibinchundatt Date: Tue, 28 Jul 2020 11:55:47 +0530 Subject: [PATCH] YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah. --- .../scheduler/capacity/CapacityScheduler.java | 14 ++++++++++++++ .../capacity/CapacitySchedulerMetrics.java | 12 ++++++++++++ .../TestCapacitySchedulerMetrics.java | 6 ++++++ 3 files changed, 32 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index bd2acd7611..699c831096 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) { case NODE_UPDATE: { NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event; + updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent); nodeUpdate(nodeUpdatedEvent.getRMNode()); } break; @@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) { } } + private void updateSchedulerNodeHBIntervalMetrics( + NodeUpdateSchedulerEvent nodeUpdatedEvent) { + // Add metrics for evaluating the time difference between heartbeats. + SchedulerNode node = + nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID()); + if (node != null) { + long lastInterval = + Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); + CapacitySchedulerMetrics.getMetrics() + .addSchedulerNodeHBInterval(lastInterval); + } + } + @Override protected void completedContainerInternal( RMContainer rmContainer, ContainerStatus containerStatus, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java index 5f8988b077..be30c60cda 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java @@ -26,6 +26,7 @@ import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableRate; import java.util.concurrent.atomic.AtomicBoolean; @@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics { @Metric("Scheduler commit success") MutableRate commitSuccess; @Metric("Scheduler commit failure") MutableRate commitFailure; @Metric("Scheduler node update") MutableRate nodeUpdate; + @Metric("Scheduler node heartbeat interval") MutableQuantiles + schedulerNodeHBInterval; private static volatile CapacitySchedulerMetrics INSTANCE = null; private static MetricsRegistry registry; @@ -116,4 +119,13 @@ public long getNumOfAllocates() { public long getNumOfCommitSuccess() { return this.commitSuccess.lastStat().numSamples(); } + + public void addSchedulerNodeHBInterval(long heartbeatInterval) { + schedulerNodeHBInterval.add(heartbeatInterval); + } + + @VisibleForTesting + public long getNumOfSchedulerNodeHBInterval() { + return this.schedulerNodeHBInterval.getEstimator().getCount(); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java index 62690e9e30..99b3983f86 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java @@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() { try { GenericTestUtils.waitFor(() -> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000); + GenericTestUtils + .waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2, + 100, 3000); } catch(TimeoutException e) { Assert.fail("CS metrics not updated on node-update events."); } @@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() { // Verify HB metrics updated GenericTestUtils.waitFor(() -> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000); + GenericTestUtils + .waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4, + 100, 3000); // For async mode, the number of alloc might be bigger than 1 Assert.assertTrue(csMetrics.getNumOfAllocates() > 0); // But there will be only 2 successful commit (1 AM + 1 task)