YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.

This commit is contained in:
bibinchundatt 2020-07-28 11:55:47 +05:30
parent 026dce5334
commit 5dadf963d3
3 changed files with 32 additions and 0 deletions

View File

@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) {
case NODE_UPDATE:
{
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
nodeUpdate(nodeUpdatedEvent.getRMNode());
}
break;
@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) {
}
}
/**
 * Records, as a scheduler metric, how long it has been since the node
 * referenced by the given event last heartbeated.
 *
 * <p>The node is looked up in {@code nodeTracker} by the event's node ID.
 * When no such node is tracked, no sample is recorded.
 *
 * @param nodeUpdatedEvent the NODE_UPDATE event whose node is being measured
 */
private void updateSchedulerNodeHBIntervalMetrics(
    NodeUpdateSchedulerEvent nodeUpdatedEvent) {
  SchedulerNode schedulerNode =
      nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
  if (schedulerNode == null) {
    // Nothing tracked for this node ID, so there is no previous
    // heartbeat time to compare against.
    return;
  }

  // Difference between the current monotonic clock reading and the
  // monotonic time stored on the node for its last heartbeat.
  long now = Time.monotonicNow();
  long elapsedSinceLastHB =
      now - schedulerNode.getLastHeartbeatMonotonicTime();

  CapacitySchedulerMetrics.getMetrics()
      .addSchedulerNodeHBInterval(elapsedSinceLastHB);
}
@Override
protected void completedContainerInternal(
RMContainer rmContainer, ContainerStatus containerStatus,

View File

@ -26,6 +26,7 @@
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;
import java.util.concurrent.atomic.AtomicBoolean;
@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
@Metric("Scheduler commit success") MutableRate commitSuccess;
@Metric("Scheduler commit failure") MutableRate commitFailure;
@Metric("Scheduler node update") MutableRate nodeUpdate;
@Metric("Scheduler node heartbeat interval") MutableQuantiles
schedulerNodeHBInterval;
private static volatile CapacitySchedulerMetrics INSTANCE = null;
private static MetricsRegistry registry;
@ -116,4 +119,13 @@ public long getNumOfAllocates() {
public long getNumOfCommitSuccess() {
return this.commitSuccess.lastStat().numSamples();
}
/**
 * Adds one sample to the scheduler node heartbeat interval metric.
 *
 * @param heartbeatInterval the interval value to record
 */
public void addSchedulerNodeHBInterval(long heartbeatInterval) {
  this.schedulerNodeHBInterval.add(heartbeatInterval);
}
/**
 * Returns the count reported by the estimator behind the scheduler node
 * heartbeat interval metric.
 *
 * @return the estimator's current count
 */
@VisibleForTesting
public long getNumOfSchedulerNodeHBInterval() {
  return schedulerNodeHBInterval
      .getEstimator()
      .getCount();
}
}

View File

@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
try {
GenericTestUtils.waitFor(()
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
GenericTestUtils
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
100, 3000);
} catch(TimeoutException e) {
Assert.fail("CS metrics not updated on node-update events.");
}
@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
// Verify HB metrics updated
GenericTestUtils.waitFor(()
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
GenericTestUtils
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
100, 3000);
// For async mode, the number of alloc might be bigger than 1
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
// But there will be only 2 successful commit (1 AM + 1 task)