YARN-10258. Add metrics for 'ApplicationsRunning' in NodeManager. Contributed by ANANDA G B.
This commit is contained in:
parent
8891e5c028
commit
eb72628e15
@ -442,6 +442,7 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||
ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc,
|
||||
appId, creds, context, p.getAppLogAggregationInitedTime());
|
||||
context.getApplications().put(appId, app);
|
||||
metrics.runningApplication();
|
||||
app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext));
|
||||
}
|
||||
|
||||
@ -1137,6 +1138,7 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||
applicationID, credentials, context);
|
||||
if (context.getApplications().putIfAbsent(applicationID,
|
||||
application) == null) {
|
||||
metrics.runningApplication();
|
||||
LOG.info("Creating a new application reference for app "
|
||||
+ applicationID);
|
||||
LogAggregationContext logAggregationContext =
|
||||
|
@ -623,6 +623,9 @@ public class ApplicationImpl implements Application {
|
||||
public void transition(ApplicationImpl app, ApplicationEvent event) {
|
||||
ApplicationId appId = event.getApplicationID();
|
||||
app.context.getApplications().remove(appId);
|
||||
if (null != app.context.getNodeManagerMetrics()) {
|
||||
app.context.getNodeManagerMetrics().endRunningApplication();
|
||||
}
|
||||
app.aclsManager.removeApplication(appId);
|
||||
try {
|
||||
app.context.getNMStateStore().removeApplication(appId);
|
||||
|
@ -100,6 +100,8 @@ public class NodeManagerMetrics {
|
||||
MutableGaugeFloat nodeCpuUtilization;
|
||||
@Metric("Current GPU utilization")
|
||||
MutableGaugeFloat nodeGpuUtilization;
|
||||
@Metric("Current running apps")
|
||||
MutableGaugeInt applicationsRunning;
|
||||
|
||||
@Metric("Missed localization requests in bytes")
|
||||
MutableCounterLong localizedCacheMissBytes;
|
||||
@ -187,6 +189,14 @@ public class NodeManagerMetrics {
|
||||
containersReIniting.decr();
|
||||
}
|
||||
|
||||
/**
 * Records that one more application is running on this node by
 * incrementing the {@code applicationsRunning} gauge.
 */
public void runningApplication() {
|
||||
applicationsRunning.incr();
|
||||
}
|
||||
|
||||
/**
 * Records that one application is no longer running on this node by
 * decrementing the {@code applicationsRunning} gauge.
 *
 * NOTE(review): the decrement is unconditional; nothing here stops the gauge
 * from dropping below zero if calls are not paired with
 * {@code runningApplication()} -- confirm callers keep them balanced.
 */
public void endRunningApplication() {
|
||||
applicationsRunning.decr();
|
||||
}
|
||||
|
||||
/**
 * Records that one more container has been paused by incrementing the
 * {@code containersPaused} metric.
 */
public void pausedContainer() {
|
||||
containersPaused.incr();
|
||||
}
|
||||
|
@ -438,7 +438,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
||||
org.apache.hadoop.yarn.server.nodemanager
|
||||
.containermanager.container.ContainerState.RUNNING);
|
||||
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
||||
1, 1, 1, 9, 1, 7, 0F);
|
||||
1, 1, 1, 9, 1, 7, 0F, 1);
|
||||
|
||||
// restart and verify metrics could be recovered
|
||||
cm.stop();
|
||||
@ -446,7 +446,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
||||
metrics = NodeManagerMetrics.create();
|
||||
metrics.addResource(Resource.newInstance(10240, 8));
|
||||
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
|
||||
0, 0, 10, 0, 8, 0F);
|
||||
0, 0, 10, 0, 8, 0F, 0);
|
||||
context = createContext(conf, stateStore);
|
||||
cm = createContainerManager(context, delSrvc);
|
||||
cm.init(conf);
|
||||
@ -455,7 +455,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
||||
app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
||||
1, 1, 1, 9, 1, 7, 0F);
|
||||
1, 1, 1, 9, 1, 7, 0F, 1);
|
||||
cm.stop();
|
||||
}
|
||||
|
||||
|
@ -103,12 +103,16 @@ public class TestNodeManagerMetrics {
|
||||
// Set node gpu utilization
|
||||
metrics.setNodeGpuUtilization(35.5F);
|
||||
|
||||
// ApplicationsRunning expected to be 1
|
||||
metrics.runningApplication();
|
||||
metrics.runningApplication();
|
||||
metrics.endRunningApplication();
|
||||
|
||||
// availableGB is expected to be floored,
|
||||
// while allocatedGB is expected to be ceiled.
|
||||
// allocatedGB: 3.75GB allocated memory is shown as 4GB
|
||||
// availableGB: 4.25GB available memory is shown as 4GB
|
||||
checkMetrics(10, 1, 1, 1, 1,
|
||||
1, 4, 7, 4, 13, 3, 35.5F);
|
||||
checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3, 35.5F, 1);
|
||||
|
||||
// Update resource and check available resource again
|
||||
metrics.addResource(total);
|
||||
@ -120,7 +124,7 @@ public class TestNodeManagerMetrics {
|
||||
public static void checkMetrics(int launched, int completed, int failed,
|
||||
int killed, int initing, int running, int allocatedGB,
|
||||
int allocatedContainers, int availableGB, int allocatedVCores,
|
||||
int availableVCores, Float nodeGpuUtilization) {
|
||||
int availableVCores, Float nodeGpuUtilization, int applicationsRunning) {
|
||||
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
|
||||
assertCounter("ContainersLaunched", launched, rb);
|
||||
assertCounter("ContainersCompleted", completed, rb);
|
||||
@ -132,8 +136,8 @@ public class TestNodeManagerMetrics {
|
||||
assertGauge("AllocatedVCores", allocatedVCores, rb);
|
||||
assertGauge("AllocatedContainers", allocatedContainers, rb);
|
||||
assertGauge("AvailableGB", availableGB, rb);
|
||||
assertGauge("AvailableVCores",availableVCores, rb);
|
||||
assertGauge("AvailableVCores", availableVCores, rb);
|
||||
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
|
||||
|
||||
assertGauge("ApplicationsRunning", applicationsRunning, rb);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user