YARN-10258. Add metrics for 'ApplicationsRunning' in NodeManager. Contributed by ANANDA G B.

This commit is contained in:
Peter Bacsko 2021-05-19 10:31:57 +02:00
parent 611ed04aff
commit 864b710787
5 changed files with 27 additions and 8 deletions

View File

@ -442,6 +442,7 @@ private void recoverApplication(ContainerManagerApplicationProto p)
ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc,
appId, creds, context, p.getAppLogAggregationInitedTime());
context.getApplications().put(appId, app);
metrics.runningApplication();
app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext));
}
@ -1137,6 +1138,7 @@ protected void startContainerInternal(
applicationID, credentials, context);
if (context.getApplications().putIfAbsent(applicationID,
application) == null) {
metrics.runningApplication();
LOG.info("Creating a new application reference for app "
+ applicationID);
LogAggregationContext logAggregationContext =

View File

@ -623,6 +623,9 @@ static class AppLogsAggregatedTransition implements
public void transition(ApplicationImpl app, ApplicationEvent event) {
ApplicationId appId = event.getApplicationID();
app.context.getApplications().remove(appId);
if (null != app.context.getNodeManagerMetrics()) {
app.context.getNodeManagerMetrics().endRunningApplication();
}
app.aclsManager.removeApplication(appId);
try {
app.context.getNMStateStore().removeApplication(appId);

View File

@ -100,6 +100,8 @@ public class NodeManagerMetrics {
MutableGaugeFloat nodeCpuUtilization;
@Metric("Current GPU utilization")
MutableGaugeFloat nodeGpuUtilization;
@Metric("Current running apps")
MutableGaugeInt applicationsRunning;
@Metric("Missed localization requests in bytes")
MutableCounterLong localizedCacheMissBytes;
@ -187,6 +189,14 @@ public void endReInitingContainer() {
containersReIniting.decr();
}
/**
 * Records that one more application is running on this NodeManager by
 * incrementing the {@code applicationsRunning} gauge.
 */
public void runningApplication() {
applicationsRunning.incr();
}
/**
 * Records that an application is no longer running on this NodeManager by
 * decrementing the {@code applicationsRunning} gauge. Intended to be paired
 * with an earlier {@link #runningApplication()} call for the same application.
 */
public void endRunningApplication() {
applicationsRunning.decr();
}
/**
 * Increments the {@code containersPaused} metric by one.
 */
public void pausedContainer() {
containersPaused.incr();
}

View File

@ -438,7 +438,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.ContainerState.RUNNING);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F);
1, 1, 1, 9, 1, 7, 0F, 1);
// restart and verify metrics could be recovered
cm.stop();
@ -446,7 +446,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
metrics = NodeManagerMetrics.create();
metrics.addResource(Resource.newInstance(10240, 8));
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
0, 0, 10, 0, 8, 0F);
0, 0, 10, 0, 8, 0F, 0);
context = createContext(conf, stateStore);
cm = createContainerManager(context, delSrvc);
cm.init(conf);
@ -455,7 +455,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
app = context.getApplications().get(appId);
assertNotNull(app);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F);
1, 1, 1, 9, 1, 7, 0F, 1);
cm.stop();
}

View File

@ -103,12 +103,16 @@ public void testReferenceOfSingletonJvmMetrics() {
// Set node gpu utilization
metrics.setNodeGpuUtilization(35.5F);
// ApplicationsRunning expected to be 1
metrics.runningApplication();
metrics.runningApplication();
metrics.endRunningApplication();
// availableGB is expected to be rounded down (floored),
// while allocatedGB is expected to be rounded up (ceiling).
// allocatedGB: 3.75GB allocated memory is shown as 4GB
// availableGB: 4.25GB available memory is shown as 4GB
checkMetrics(10, 1, 1, 1, 1,
1, 4, 7, 4, 13, 3, 35.5F);
checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3, 35.5F, 1);
// Update resource and check available resource again
metrics.addResource(total);
@ -120,7 +124,7 @@ public void testReferenceOfSingletonJvmMetrics() {
public static void checkMetrics(int launched, int completed, int failed,
int killed, int initing, int running, int allocatedGB,
int allocatedContainers, int availableGB, int allocatedVCores,
int availableVCores, Float nodeGpuUtilization) {
int availableVCores, Float nodeGpuUtilization, int applicationsRunning) {
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
assertCounter("ContainersLaunched", launched, rb);
assertCounter("ContainersCompleted", completed, rb);
@ -132,8 +136,8 @@ public static void checkMetrics(int launched, int completed, int failed,
assertGauge("AllocatedVCores", allocatedVCores, rb);
assertGauge("AllocatedContainers", allocatedContainers, rb);
assertGauge("AvailableGB", availableGB, rb);
assertGauge("AvailableVCores",availableVCores, rb);
assertGauge("AvailableVCores", availableVCores, rb);
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
assertGauge("ApplicationsRunning", applicationsRunning, rb);
}
}