diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 4775bcb0ea..1cb9ea0639 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -173,6 +173,8 @@ Release 2.4.0 - UNRELEASED YARN-1846. TestRM#testNMTokenSentForNormalContainer assumes CapacityScheduler. (Robert Kanter via kasha) + YARN-1705. Reset cluster-metrics on transition to standby. (Rohith via kasha) + IMPROVEMENTS YARN-1007. Enhance History Reader interface for Containers. (Mayank Bansal via diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 12a533b6c8..ad6477cc28 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -81,6 +81,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; @@ -840,6 +841,8 @@ void stopActiveServices() throws Exception { rmContext.getRMNodes().clear(); rmContext.getInactiveRMNodes().clear(); rmContext.getRMApps().clear(); + ClusterMetrics.destroy(); + QueueMetrics.clearQueueMetrics(); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java index a0519e2fa0..507b798a56 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java @@ -127,7 +127,7 @@ static QueueMetrics forQueue(String queueName, Queue parent, } /** - * Helper method to clear cache - used only for unit tests. + * Helper method to clear cache. */ @Private public synchronized static void clearQueueMetrics() { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java index c0427aa3cb..62ce51e46a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java @@ -32,6 +32,7 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.junit.Before; import org.junit.Test; @@ -138,32 +139,38 @@ public void testStartAndTransitions() throws IOException { rm.start(); checkMonitorHealth(); checkStandbyRMFunctionality(); - + verifyClusterMetrics(0, 0, 0, 0, 0, 0); + // 1. Transition to Standby - must be a no-op rm.adminService.transitionToStandby(requestInfo); checkMonitorHealth(); checkStandbyRMFunctionality(); - + verifyClusterMetrics(0, 0, 0, 0, 0, 0); + // 2. Transition to active rm.adminService.transitionToActive(requestInfo); checkMonitorHealth(); checkActiveRMFunctionality(); - + verifyClusterMetrics(1, 1, 1, 1, 2048, 1); + // 3. Transition to active - no-op rm.adminService.transitionToActive(requestInfo); checkMonitorHealth(); checkActiveRMFunctionality(); - + verifyClusterMetrics(1, 2, 2, 2, 2048, 2); + // 4. Transition to standby rm.adminService.transitionToStandby(requestInfo); checkMonitorHealth(); checkStandbyRMFunctionality(); - + verifyClusterMetrics(0, 0, 0, 0, 0, 0); + // 5. Transition to active to check Active->Standby->Active works rm.adminService.transitionToActive(requestInfo); checkMonitorHealth(); checkActiveRMFunctionality(); - + verifyClusterMetrics(1, 1, 1, 1, 2048, 1); + // 6. Stop the RM. All services should stop and RM should not be ready to // become active rm.stop(); @@ -367,6 +374,27 @@ public void testHAWithRMHostName() { fail("Should not throw any exceptions."); } } + + private void verifyClusterMetrics(int activeNodes, int appsSubmitted, + int appsPending, int containersPending, int availableMB, + int activeApplications) { + QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics(); + // verify queue metrics + assertMetric("appsSubmitted", appsSubmitted, metrics.getAppsSubmitted()); + assertMetric("appsPending", appsPending, metrics.getAppsPending()); + assertMetric("containersPending", containersPending, + metrics.getPendingContainers()); + assertMetric("availableMB", availableMB, metrics.getAvailableMB()); + assertMetric("activeApplications", activeApplications, + metrics.getActiveApps()); + // verify node metric + ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics(); + assertMetric("activeNodes", activeNodes, clusterMetrics.getNumActiveNMs()); + } + + private void assertMetric(String metricName, int expected, int actual) { + assertEquals("Incorrect value for metric " + metricName, expected, actual); + } @SuppressWarnings("rawtypes") class MyCountingDispatcher extends AbstractService implements Dispatcher {