From eb72628e151b70c6513723f37f87c7855a624876 Mon Sep 17 00:00:00 2001 From: Peter Bacsko Date: Mon, 17 May 2021 21:07:29 +0200 Subject: [PATCH] YARN-10258. Add metrics for 'ApplicationsRunning' in NodeManager. Contributed by ANANDA G B. --- .../containermanager/ContainerManagerImpl.java | 2 ++ .../application/ApplicationImpl.java | 3 +++ .../nodemanager/metrics/NodeManagerMetrics.java | 10 ++++++++++ .../TestContainerManagerRecovery.java | 6 +++--- .../metrics/TestNodeManagerMetrics.java | 14 +++++++++----- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 5eb36ba558..ee1a5bf285 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -442,6 +442,7 @@ private void recoverApplication(ContainerManagerApplicationProto p) ApplicationImpl app = new ApplicationImpl(dispatcher, p.getUser(), fc, appId, creds, context, p.getAppLogAggregationInitedTime()); context.getApplications().put(appId, app); + metrics.runningApplication(); app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext)); } @@ -1137,6 +1138,7 @@ protected void startContainerInternal( applicationID, credentials, context); if (context.getApplications().putIfAbsent(applicationID, application) == null) { + metrics.runningApplication(); LOG.info("Creating a new application reference for app " + applicationID); LogAggregationContext logAggregationContext = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java index 8fe9651045..d42097430c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java @@ -623,6 +623,9 @@ static class AppLogsAggregatedTransition implements public void transition(ApplicationImpl app, ApplicationEvent event) { ApplicationId appId = event.getApplicationID(); app.context.getApplications().remove(appId); + if (null != app.context.getNodeManagerMetrics()) { + app.context.getNodeManagerMetrics().endRunningApplication(); + } app.aclsManager.removeApplication(appId); try { app.context.getNMStateStore().removeApplication(appId); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index 848b944528..a469f653eb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -100,6 +100,8 @@ public class NodeManagerMetrics { MutableGaugeFloat nodeCpuUtilization; @Metric("Current GPU utilization") MutableGaugeFloat nodeGpuUtilization; + @Metric("Current running apps") + MutableGaugeInt applicationsRunning; @Metric("Missed localization requests in bytes") MutableCounterLong localizedCacheMissBytes; @@ -187,6 +189,14 @@ public void endReInitingContainer() { containersReIniting.decr(); } + public void runningApplication() { + applicationsRunning.incr(); + } + + public void endRunningApplication() { + applicationsRunning.decr(); + } + public void pausedContainer() { containersPaused.incr(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index c67ae86f95..de29efd59f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -438,7 +438,7 @@ public void testNodeManagerMetricsRecovery() throws Exception { org.apache.hadoop.yarn.server.nodemanager .containermanager.container.ContainerState.RUNNING); TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, - 1, 1, 1, 9, 1, 7, 0F); + 1, 1, 1, 9, 1, 7, 0F, 1); // restart and verify metrics could be recovered cm.stop(); @@ -446,7 +446,7 @@ public void testNodeManagerMetricsRecovery() throws Exception { metrics = NodeManagerMetrics.create(); metrics.addResource(Resource.newInstance(10240, 8)); TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, - 0, 0, 10, 0, 8, 0F); + 0, 0, 10, 0, 8, 0F, 0); context = createContext(conf, stateStore); cm = createContainerManager(context, delSrvc); cm.init(conf); @@ -455,7 +455,7 @@ public void testNodeManagerMetricsRecovery() throws Exception { app = context.getApplications().get(appId); assertNotNull(app); TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, - 1, 1, 1, 9, 1, 7, 0F); + 1, 1, 1, 9, 1, 7, 0F, 1); cm.stop(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java index 37454747c9..33a3ae12f1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java @@ -103,12 +103,16 @@ public void testReferenceOfSingletonJvmMetrics() { // Set node gpu utilization metrics.setNodeGpuUtilization(35.5F); + // ApplicationsRunning expected to be 1 + metrics.runningApplication(); + metrics.runningApplication(); + metrics.endRunningApplication(); + // availableGB is expected to be floored, // while allocatedGB is expected to be ceiled. // allocatedGB: 3.75GB allocated memory is shown as 4GB // availableGB: 4.25GB available memory is shown as 4GB - checkMetrics(10, 1, 1, 1, 1, - 1, 4, 7, 4, 13, 3, 35.5F); + checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3, 35.5F, 1); // Update resource and check available resource again metrics.addResource(total); @@ -120,7 +124,7 @@ public void testReferenceOfSingletonJvmMetrics() { public static void checkMetrics(int launched, int completed, int failed, int killed, int initing, int running, int allocatedGB, int allocatedContainers, int availableGB, int allocatedVCores, - int availableVCores, Float nodeGpuUtilization) { + int availableVCores, Float nodeGpuUtilization, int applicationsRunning) { MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics"); assertCounter("ContainersLaunched", launched, rb); assertCounter("ContainersCompleted", completed, rb); @@ -132,8 +136,8 @@ public static void checkMetrics(int launched, int completed, int failed, assertGauge("AllocatedVCores", allocatedVCores, rb); assertGauge("AllocatedContainers", allocatedContainers, rb); assertGauge("AvailableGB", availableGB, rb); - assertGauge("AvailableVCores",availableVCores, rb); + assertGauge("AvailableVCores", availableVCores, rb); assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb); - + assertGauge("ApplicationsRunning", applicationsRunning, rb); } }