YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.
This commit is contained in:
parent
026dce5334
commit
5dadf963d3
@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) {
|
||||
case NODE_UPDATE:
|
||||
{
|
||||
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
|
||||
updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
|
||||
nodeUpdate(nodeUpdatedEvent.getRMNode());
|
||||
}
|
||||
break;
|
||||
@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) {
|
||||
}
|
||||
}
|
||||
|
||||
private void updateSchedulerNodeHBIntervalMetrics(
|
||||
NodeUpdateSchedulerEvent nodeUpdatedEvent) {
|
||||
// Add metrics for evaluating the time difference between heartbeats.
|
||||
SchedulerNode node =
|
||||
nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
|
||||
if (node != null) {
|
||||
long lastInterval =
|
||||
Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
|
||||
CapacitySchedulerMetrics.getMetrics()
|
||||
.addSchedulerNodeHBInterval(lastInterval);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void completedContainerInternal(
|
||||
RMContainer rmContainer, ContainerStatus containerStatus,
|
||||
|
@ -26,6 +26,7 @@
|
||||
import org.apache.hadoop.metrics2.annotation.Metrics;
|
||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
|
||||
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRate;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
|
||||
@Metric("Scheduler commit success") MutableRate commitSuccess;
|
||||
@Metric("Scheduler commit failure") MutableRate commitFailure;
|
||||
@Metric("Scheduler node update") MutableRate nodeUpdate;
|
||||
@Metric("Scheduler node heartbeat interval") MutableQuantiles
|
||||
schedulerNodeHBInterval;
|
||||
|
||||
private static volatile CapacitySchedulerMetrics INSTANCE = null;
|
||||
private static MetricsRegistry registry;
|
||||
@ -116,4 +119,13 @@ public long getNumOfAllocates() {
|
||||
public long getNumOfCommitSuccess() {
|
||||
return this.commitSuccess.lastStat().numSamples();
|
||||
}
|
||||
|
||||
public void addSchedulerNodeHBInterval(long heartbeatInterval) {
|
||||
schedulerNodeHBInterval.add(heartbeatInterval);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public long getNumOfSchedulerNodeHBInterval() {
|
||||
return this.schedulerNodeHBInterval.getEstimator().getCount();
|
||||
}
|
||||
}
|
||||
|
@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
|
||||
try {
|
||||
GenericTestUtils.waitFor(()
|
||||
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
|
||||
GenericTestUtils
|
||||
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
|
||||
100, 3000);
|
||||
} catch(TimeoutException e) {
|
||||
Assert.fail("CS metrics not updated on node-update events.");
|
||||
}
|
||||
@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
|
||||
// Verify HB metrics updated
|
||||
GenericTestUtils.waitFor(()
|
||||
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
|
||||
GenericTestUtils
|
||||
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
|
||||
100, 3000);
|
||||
// For async mode, the number of alloc might be bigger than 1
|
||||
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
|
||||
// But there will be only 2 successful commit (1 AM + 1 task)
|
||||
|
Loading…
Reference in New Issue
Block a user