YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.
This commit is contained in:
parent
026dce5334
commit
5dadf963d3
@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) {
|
|||||||
case NODE_UPDATE:
|
case NODE_UPDATE:
|
||||||
{
|
{
|
||||||
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
|
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
|
||||||
|
updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
|
||||||
nodeUpdate(nodeUpdatedEvent.getRMNode());
|
nodeUpdate(nodeUpdatedEvent.getRMNode());
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void updateSchedulerNodeHBIntervalMetrics(
|
||||||
|
NodeUpdateSchedulerEvent nodeUpdatedEvent) {
|
||||||
|
// Add metrics for evaluating the time difference between heartbeats.
|
||||||
|
SchedulerNode node =
|
||||||
|
nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
|
||||||
|
if (node != null) {
|
||||||
|
long lastInterval =
|
||||||
|
Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
|
||||||
|
CapacitySchedulerMetrics.getMetrics()
|
||||||
|
.addSchedulerNodeHBInterval(lastInterval);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void completedContainerInternal(
|
protected void completedContainerInternal(
|
||||||
RMContainer rmContainer, ContainerStatus containerStatus,
|
RMContainer rmContainer, ContainerStatus containerStatus,
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
import org.apache.hadoop.metrics2.annotation.Metrics;
|
import org.apache.hadoop.metrics2.annotation.Metrics;
|
||||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||||
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
|
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
|
||||||
|
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
|
||||||
import org.apache.hadoop.metrics2.lib.MutableRate;
|
import org.apache.hadoop.metrics2.lib.MutableRate;
|
||||||
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
|
|||||||
@Metric("Scheduler commit success") MutableRate commitSuccess;
|
@Metric("Scheduler commit success") MutableRate commitSuccess;
|
||||||
@Metric("Scheduler commit failure") MutableRate commitFailure;
|
@Metric("Scheduler commit failure") MutableRate commitFailure;
|
||||||
@Metric("Scheduler node update") MutableRate nodeUpdate;
|
@Metric("Scheduler node update") MutableRate nodeUpdate;
|
||||||
|
@Metric("Scheduler node heartbeat interval") MutableQuantiles
|
||||||
|
schedulerNodeHBInterval;
|
||||||
|
|
||||||
private static volatile CapacitySchedulerMetrics INSTANCE = null;
|
private static volatile CapacitySchedulerMetrics INSTANCE = null;
|
||||||
private static MetricsRegistry registry;
|
private static MetricsRegistry registry;
|
||||||
@ -116,4 +119,13 @@ public long getNumOfAllocates() {
|
|||||||
public long getNumOfCommitSuccess() {
|
public long getNumOfCommitSuccess() {
|
||||||
return this.commitSuccess.lastStat().numSamples();
|
return this.commitSuccess.lastStat().numSamples();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void addSchedulerNodeHBInterval(long heartbeatInterval) {
|
||||||
|
schedulerNodeHBInterval.add(heartbeatInterval);
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public long getNumOfSchedulerNodeHBInterval() {
|
||||||
|
return this.schedulerNodeHBInterval.getEstimator().getCount();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
|
|||||||
try {
|
try {
|
||||||
GenericTestUtils.waitFor(()
|
GenericTestUtils.waitFor(()
|
||||||
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
|
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
|
||||||
|
GenericTestUtils
|
||||||
|
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
|
||||||
|
100, 3000);
|
||||||
} catch(TimeoutException e) {
|
} catch(TimeoutException e) {
|
||||||
Assert.fail("CS metrics not updated on node-update events.");
|
Assert.fail("CS metrics not updated on node-update events.");
|
||||||
}
|
}
|
||||||
@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
|
|||||||
// Verify HB metrics updated
|
// Verify HB metrics updated
|
||||||
GenericTestUtils.waitFor(()
|
GenericTestUtils.waitFor(()
|
||||||
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
|
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
|
||||||
|
GenericTestUtils
|
||||||
|
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
|
||||||
|
100, 3000);
|
||||||
// For async mode, the number of alloc might be bigger than 1
|
// For async mode, the number of alloc might be bigger than 1
|
||||||
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
|
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
|
||||||
// But there will be only 2 successful commit (1 AM + 1 task)
|
// But there will be only 2 successful commit (1 AM + 1 task)
|
||||||
|
Loading…
Reference in New Issue
Block a user