YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.

This commit is contained in:
bibinchundatt 2020-07-28 11:55:47 +05:30
parent 026dce5334
commit 5dadf963d3
3 changed files with 32 additions and 0 deletions

View File

@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) {
case NODE_UPDATE: case NODE_UPDATE:
{ {
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event; NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
nodeUpdate(nodeUpdatedEvent.getRMNode()); nodeUpdate(nodeUpdatedEvent.getRMNode());
} }
break; break;
@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) {
} }
} }
private void updateSchedulerNodeHBIntervalMetrics(
NodeUpdateSchedulerEvent nodeUpdatedEvent) {
// Add metrics for evaluating the time difference between heartbeats.
SchedulerNode node =
nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
if (node != null) {
long lastInterval =
Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
CapacitySchedulerMetrics.getMetrics()
.addSchedulerNodeHBInterval(lastInterval);
}
}
@Override @Override
protected void completedContainerInternal( protected void completedContainerInternal(
RMContainer rmContainer, ContainerStatus containerStatus, RMContainer rmContainer, ContainerStatus containerStatus,

View File

@ -26,6 +26,7 @@
import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.metrics2.lib.MutableRate;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
@Metric("Scheduler commit success") MutableRate commitSuccess; @Metric("Scheduler commit success") MutableRate commitSuccess;
@Metric("Scheduler commit failure") MutableRate commitFailure; @Metric("Scheduler commit failure") MutableRate commitFailure;
@Metric("Scheduler node update") MutableRate nodeUpdate; @Metric("Scheduler node update") MutableRate nodeUpdate;
@Metric("Scheduler node heartbeat interval") MutableQuantiles
schedulerNodeHBInterval;
private static volatile CapacitySchedulerMetrics INSTANCE = null; private static volatile CapacitySchedulerMetrics INSTANCE = null;
private static MetricsRegistry registry; private static MetricsRegistry registry;
@ -116,4 +119,13 @@ public long getNumOfAllocates() {
public long getNumOfCommitSuccess() { public long getNumOfCommitSuccess() {
return this.commitSuccess.lastStat().numSamples(); return this.commitSuccess.lastStat().numSamples();
} }
public void addSchedulerNodeHBInterval(long heartbeatInterval) {
schedulerNodeHBInterval.add(heartbeatInterval);
}
@VisibleForTesting
public long getNumOfSchedulerNodeHBInterval() {
return this.schedulerNodeHBInterval.getEstimator().getCount();
}
} }

View File

@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
try { try {
GenericTestUtils.waitFor(() GenericTestUtils.waitFor(()
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000); -> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
GenericTestUtils
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
100, 3000);
} catch(TimeoutException e) { } catch(TimeoutException e) {
Assert.fail("CS metrics not updated on node-update events."); Assert.fail("CS metrics not updated on node-update events.");
} }
@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
// Verify HB metrics updated // Verify HB metrics updated
GenericTestUtils.waitFor(() GenericTestUtils.waitFor(()
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000); -> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
GenericTestUtils
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
100, 3000);
// For async mode, the number of alloc might be bigger than 1 // For async mode, the number of alloc might be bigger than 1
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0); Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
// But there will be only 2 successful commit (1 AM + 1 task) // But there will be only 2 successful commit (1 AM + 1 task)