YARN-10258. Add metrics for 'ApplicationsRunning' in NodeManager. Contributed by ANANDA G B.
This commit is contained in:
parent
8891e5c028
commit
eb72628e15
@ -442,6 +442,7 @@ private void recoverApplication(ContainerManagerApplicationProto p)
|
|||||||
ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc,
|
ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc,
|
||||||
appId, creds, context, p.getAppLogAggregationInitedTime());
|
appId, creds, context, p.getAppLogAggregationInitedTime());
|
||||||
context.getApplications().put(appId, app);
|
context.getApplications().put(appId, app);
|
||||||
|
metrics.runningApplication();
|
||||||
app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext));
|
app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1137,6 +1138,7 @@ protected void startContainerInternal(
|
|||||||
applicationID, credentials, context);
|
applicationID, credentials, context);
|
||||||
if (context.getApplications().putIfAbsent(applicationID,
|
if (context.getApplications().putIfAbsent(applicationID,
|
||||||
application) == null) {
|
application) == null) {
|
||||||
|
metrics.runningApplication();
|
||||||
LOG.info("Creating a new application reference for app "
|
LOG.info("Creating a new application reference for app "
|
||||||
+ applicationID);
|
+ applicationID);
|
||||||
LogAggregationContext logAggregationContext =
|
LogAggregationContext logAggregationContext =
|
||||||
|
@ -623,6 +623,9 @@ static class AppLogsAggregatedTransition implements
|
|||||||
public void transition(ApplicationImpl app, ApplicationEvent event) {
|
public void transition(ApplicationImpl app, ApplicationEvent event) {
|
||||||
ApplicationId appId = event.getApplicationID();
|
ApplicationId appId = event.getApplicationID();
|
||||||
app.context.getApplications().remove(appId);
|
app.context.getApplications().remove(appId);
|
||||||
|
if (null != app.context.getNodeManagerMetrics()) {
|
||||||
|
app.context.getNodeManagerMetrics().endRunningApplication();
|
||||||
|
}
|
||||||
app.aclsManager.removeApplication(appId);
|
app.aclsManager.removeApplication(appId);
|
||||||
try {
|
try {
|
||||||
app.context.getNMStateStore().removeApplication(appId);
|
app.context.getNMStateStore().removeApplication(appId);
|
||||||
|
@ -100,6 +100,8 @@ public class NodeManagerMetrics {
|
|||||||
MutableGaugeFloat nodeCpuUtilization;
|
MutableGaugeFloat nodeCpuUtilization;
|
||||||
@Metric("Current GPU utilization")
|
@Metric("Current GPU utilization")
|
||||||
MutableGaugeFloat nodeGpuUtilization;
|
MutableGaugeFloat nodeGpuUtilization;
|
||||||
|
@Metric("Current running apps")
|
||||||
|
MutableGaugeInt applicationsRunning;
|
||||||
|
|
||||||
@Metric("Missed localization requests in bytes")
|
@Metric("Missed localization requests in bytes")
|
||||||
MutableCounterLong localizedCacheMissBytes;
|
MutableCounterLong localizedCacheMissBytes;
|
||||||
@ -187,6 +189,14 @@ public void endReInitingContainer() {
|
|||||||
containersReIniting.decr();
|
containersReIniting.decr();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void runningApplication() {
|
||||||
|
applicationsRunning.incr();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void endRunningApplication() {
|
||||||
|
applicationsRunning.decr();
|
||||||
|
}
|
||||||
|
|
||||||
public void pausedContainer() {
|
public void pausedContainer() {
|
||||||
containersPaused.incr();
|
containersPaused.incr();
|
||||||
}
|
}
|
||||||
|
@ -438,7 +438,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
|
|||||||
org.apache.hadoop.yarn.server.nodemanager
|
org.apache.hadoop.yarn.server.nodemanager
|
||||||
.containermanager.container.ContainerState.RUNNING);
|
.containermanager.container.ContainerState.RUNNING);
|
||||||
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
||||||
1, 1, 1, 9, 1, 7, 0F);
|
1, 1, 1, 9, 1, 7, 0F, 1);
|
||||||
|
|
||||||
// restart and verify metrics could be recovered
|
// restart and verify metrics could be recovered
|
||||||
cm.stop();
|
cm.stop();
|
||||||
@ -446,7 +446,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
|
|||||||
metrics = NodeManagerMetrics.create();
|
metrics = NodeManagerMetrics.create();
|
||||||
metrics.addResource(Resource.newInstance(10240, 8));
|
metrics.addResource(Resource.newInstance(10240, 8));
|
||||||
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
|
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 10, 0, 8, 0F);
|
0, 0, 10, 0, 8, 0F, 0);
|
||||||
context = createContext(conf, stateStore);
|
context = createContext(conf, stateStore);
|
||||||
cm = createContainerManager(context, delSrvc);
|
cm = createContainerManager(context, delSrvc);
|
||||||
cm.init(conf);
|
cm.init(conf);
|
||||||
@ -455,7 +455,7 @@ public void testNodeManagerMetricsRecovery() throws Exception {
|
|||||||
app = context.getApplications().get(appId);
|
app = context.getApplications().get(appId);
|
||||||
assertNotNull(app);
|
assertNotNull(app);
|
||||||
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
|
||||||
1, 1, 1, 9, 1, 7, 0F);
|
1, 1, 1, 9, 1, 7, 0F, 1);
|
||||||
cm.stop();
|
cm.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -103,12 +103,16 @@ public void testReferenceOfSingletonJvmMetrics() {
|
|||||||
// Set node gpu utilization
|
// Set node gpu utilization
|
||||||
metrics.setNodeGpuUtilization(35.5F);
|
metrics.setNodeGpuUtilization(35.5F);
|
||||||
|
|
||||||
|
// ApplicationsRunning expected to be 1
|
||||||
|
metrics.runningApplication();
|
||||||
|
metrics.runningApplication();
|
||||||
|
metrics.endRunningApplication();
|
||||||
|
|
||||||
// availableGB is expected to be floored,
|
// availableGB is expected to be floored,
|
||||||
// while allocatedGB is expected to be ceiled.
|
// while allocatedGB is expected to be ceiled.
|
||||||
// allocatedGB: 3.75GB allocated memory is shown as 4GB
|
// allocatedGB: 3.75GB allocated memory is shown as 4GB
|
||||||
// availableGB: 4.25GB available memory is shown as 4GB
|
// availableGB: 4.25GB available memory is shown as 4GB
|
||||||
checkMetrics(10, 1, 1, 1, 1,
|
checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3, 35.5F, 1);
|
||||||
1, 4, 7, 4, 13, 3, 35.5F);
|
|
||||||
|
|
||||||
// Update resource and check available resource again
|
// Update resource and check available resource again
|
||||||
metrics.addResource(total);
|
metrics.addResource(total);
|
||||||
@ -120,7 +124,7 @@ public void testReferenceOfSingletonJvmMetrics() {
|
|||||||
public static void checkMetrics(int launched, int completed, int failed,
|
public static void checkMetrics(int launched, int completed, int failed,
|
||||||
int killed, int initing, int running, int allocatedGB,
|
int killed, int initing, int running, int allocatedGB,
|
||||||
int allocatedContainers, int availableGB, int allocatedVCores,
|
int allocatedContainers, int availableGB, int allocatedVCores,
|
||||||
int availableVCores, Float nodeGpuUtilization) {
|
int availableVCores, Float nodeGpuUtilization, int applicationsRunning) {
|
||||||
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
|
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
|
||||||
assertCounter("ContainersLaunched", launched, rb);
|
assertCounter("ContainersLaunched", launched, rb);
|
||||||
assertCounter("ContainersCompleted", completed, rb);
|
assertCounter("ContainersCompleted", completed, rb);
|
||||||
@ -132,8 +136,8 @@ public static void checkMetrics(int launched, int completed, int failed,
|
|||||||
assertGauge("AllocatedVCores", allocatedVCores, rb);
|
assertGauge("AllocatedVCores", allocatedVCores, rb);
|
||||||
assertGauge("AllocatedContainers", allocatedContainers, rb);
|
assertGauge("AllocatedContainers", allocatedContainers, rb);
|
||||||
assertGauge("AvailableGB", availableGB, rb);
|
assertGauge("AvailableGB", availableGB, rb);
|
||||||
assertGauge("AvailableVCores",availableVCores, rb);
|
assertGauge("AvailableVCores", availableVCores, rb);
|
||||||
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
|
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
|
||||||
|
assertGauge("ApplicationsRunning", applicationsRunning, rb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user