YARN-10258. Add metrics for 'ApplicationsRunning' in NodeManager. Contributed by ANANDA G B.

This commit is contained in:
Peter Bacsko 2021-05-17 21:07:29 +02:00
parent 8891e5c028
commit eb72628e15
5 changed files with 27 additions and 8 deletions

View File

@ -442,6 +442,7 @@ private void recoverApplication(ContainerManagerApplicationProto p)
ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc, ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc,
appId, creds, context, p.getAppLogAggregationInitedTime()); appId, creds, context, p.getAppLogAggregationInitedTime());
context.getApplications().put(appId, app); context.getApplications().put(appId, app);
metrics.runningApplication();
app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext)); app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext));
} }
@ -1137,6 +1138,7 @@ protected void startContainerInternal(
applicationID, credentials, context); applicationID, credentials, context);
if (context.getApplications().putIfAbsent(applicationID, if (context.getApplications().putIfAbsent(applicationID,
application) == null) { application) == null) {
metrics.runningApplication();
LOG.info("Creating a new application reference for app " LOG.info("Creating a new application reference for app "
+ applicationID); + applicationID);
LogAggregationContext logAggregationContext = LogAggregationContext logAggregationContext =

View File

@ -623,6 +623,9 @@ static class AppLogsAggregatedTransition implements
public void transition(ApplicationImpl app, ApplicationEvent event) { public void transition(ApplicationImpl app, ApplicationEvent event) {
ApplicationId appId = event.getApplicationID(); ApplicationId appId = event.getApplicationID();
app.context.getApplications().remove(appId); app.context.getApplications().remove(appId);
if (null != app.context.getNodeManagerMetrics()) {
app.context.getNodeManagerMetrics().endRunningApplication();
}
app.aclsManager.removeApplication(appId); app.aclsManager.removeApplication(appId);
try { try {
app.context.getNMStateStore().removeApplication(appId); app.context.getNMStateStore().removeApplication(appId);

View File

@ -100,6 +100,8 @@ public class NodeManagerMetrics {
MutableGaugeFloat nodeCpuUtilization; MutableGaugeFloat nodeCpuUtilization;
@Metric("Current GPU utilization") @Metric("Current GPU utilization")
MutableGaugeFloat nodeGpuUtilization; MutableGaugeFloat nodeGpuUtilization;
@Metric("Current running apps")
MutableGaugeInt applicationsRunning;
@Metric("Missed localization requests in bytes") @Metric("Missed localization requests in bytes")
MutableCounterLong localizedCacheMissBytes; MutableCounterLong localizedCacheMissBytes;
@ -187,6 +189,14 @@ public void endReInitingContainer() {
containersReIniting.decr(); containersReIniting.decr();
} }
/**
 * Increments the {@code applicationsRunning} gauge by one.
 * Counterpart of {@link #endRunningApplication()}, which decrements
 * the same gauge.
 */
public void runningApplication() {
applicationsRunning.incr();
}
/**
 * Decrements the {@code applicationsRunning} gauge by one.
 * Counterpart of {@link #runningApplication()}, which increments
 * the same gauge.
 */
public void endRunningApplication() {
applicationsRunning.decr();
}
public void pausedContainer() { public void pausedContainer() {
containersPaused.incr(); containersPaused.incr();
} }

View File

@ -438,7 +438,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
org.apache.hadoop.yarn.server.nodemanager org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.ContainerState.RUNNING); .containermanager.container.ContainerState.RUNNING);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F); 1, 1, 1, 9, 1, 7, 0F, 1);
// restart and verify metrics could be recovered // restart and verify metrics could be recovered
cm.stop(); cm.stop();
@ -446,7 +446,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
metrics = NodeManagerMetrics.create(); metrics = NodeManagerMetrics.create();
metrics.addResource(Resource.newInstance(10240, 8)); metrics.addResource(Resource.newInstance(10240, 8));
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
0, 0, 10, 0, 8, 0F); 0, 0, 10, 0, 8, 0F, 0);
context = createContext(conf, stateStore); context = createContext(conf, stateStore);
cm = createContainerManager(context, delSrvc); cm = createContainerManager(context, delSrvc);
cm.init(conf); cm.init(conf);
@ -455,7 +455,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
app = context.getApplications().get(appId); app = context.getApplications().get(appId);
assertNotNull(app); assertNotNull(app);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F); 1, 1, 1, 9, 1, 7, 0F, 1);
cm.stop(); cm.stop();
} }

View File

@ -103,12 +103,16 @@ public void testReferenceOfSingletonJvmMetrics() {
// Set node gpu utilization // Set node gpu utilization
metrics.setNodeGpuUtilization(35.5F); metrics.setNodeGpuUtilization(35.5F);
// ApplicationsRunning expected to be 1
metrics.runningApplication();
metrics.runningApplication();
metrics.endRunningApplication();
// availableGB is expected to be floored, // availableGB is expected to be floored,
// while allocatedGB is expected to be ceiled. // while allocatedGB is expected to be ceiled.
// allocatedGB: 3.75GB allocated memory is shown as 4GB // allocatedGB: 3.75GB allocated memory is shown as 4GB
// availableGB: 4.25GB available memory is shown as 4GB // availableGB: 4.25GB available memory is shown as 4GB
checkMetrics(10, 1, 1, 1, 1, checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3, 35.5F, 1);
1, 4, 7, 4, 13, 3, 35.5F);
// Update resource and check available resource again // Update resource and check available resource again
metrics.addResource(total); metrics.addResource(total);
@ -120,7 +124,7 @@ public void testReferenceOfSingletonJvmMetrics() {
public static void checkMetrics(int launched, int completed, int failed, public static void checkMetrics(int launched, int completed, int failed,
int killed, int initing, int running, int allocatedGB, int killed, int initing, int running, int allocatedGB,
int allocatedContainers, int availableGB, int allocatedVCores, int allocatedContainers, int availableGB, int allocatedVCores,
int availableVCores, Float nodeGpuUtilization) { int availableVCores, Float nodeGpuUtilization, int applicationsRunning) {
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics"); MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
assertCounter("ContainersLaunched", launched, rb); assertCounter("ContainersLaunched", launched, rb);
assertCounter("ContainersCompleted", completed, rb); assertCounter("ContainersCompleted", completed, rb);
@ -132,8 +136,8 @@ public static void checkMetrics(int launched, int completed, int failed,
assertGauge("AllocatedVCores", allocatedVCores, rb); assertGauge("AllocatedVCores", allocatedVCores, rb);
assertGauge("AllocatedContainers", allocatedContainers, rb); assertGauge("AllocatedContainers", allocatedContainers, rb);
assertGauge("AvailableGB", availableGB, rb); assertGauge("AvailableGB", availableGB, rb);
assertGauge("AvailableVCores",availableVCores, rb); assertGauge("AvailableVCores", availableVCores, rb);
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb); assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
assertGauge("ApplicationsRunning", applicationsRunning, rb);
} }
} }