From 33293d4ba44c545ba305272a381c42ce758ffd96 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Fri, 28 Oct 2022 11:07:01 -0700 Subject: [PATCH] YARN-11360: Add number of decommissioning/shutdown nodes to YARN cluster metrics. (#5060) (cherry picked from commit bfb84cd7f66fb6fe98809cc5d6c59864995855b1) --- .../yarn/api/records/YarnClusterMetrics.java | 26 ++++++++ .../src/main/proto/yarn_protos.proto | 2 + .../apache/hadoop/yarn/client/cli/TopCLI.java | 16 +++-- .../hadoop/yarn/client/cli/TestTopCLI.java | 66 ++++++++++++++++++- .../impl/pb/YarnClusterMetricsPBImpl.java | 33 +++++++++- .../resourcemanager/ClientRMService.java | 2 + .../resourcemanager/TestClientRMService.java | 56 ++++++++++++++++ .../clientrm/RouterYarnClientUtils.java | 6 ++ .../clientrm/TestRouterYarnClientUtils.java | 4 ++ 9 files changed, 204 insertions(+), 7 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/YarnClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/YarnClusterMetrics.java index fc3edf7fb7..f460e60f48 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/YarnClusterMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/YarnClusterMetrics.java @@ -53,6 +53,20 @@ public static YarnClusterMetrics newInstance(int numNodeManagers) { @Unstable public abstract void setNumNodeManagers(int numNodeManagers); + /** + * Get the number of DecommissioningNodeManagers in the cluster. + * + * @return number of DecommissioningNodeManagers in the cluster + */ + @Public + @Unstable + public abstract int getNumDecommissioningNodeManagers(); + + @Private + @Unstable + public abstract void setNumDecommissioningNodeManagers( + int numDecommissioningNodeManagers); + /** * Get the number of DecommissionedNodeManagers in the cluster. * @@ -119,4 +133,16 @@ public abstract void setNumDecommissionedNodeManagers( @Unstable public abstract void setNumRebootedNodeManagers(int numRebootedNodeManagers); + /** + * Get the number of ShutdownNodeManagers in the cluster. + * + * @return number of ShutdownNodeManagers in the cluster + */ + @Public + @Unstable + public abstract int getNumShutdownNodeManagers(); + + @Private + @Unstable + public abstract void setNumShutdownNodeManagers(int numShutdownNodeManagers); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto index e70d471384..cba5832f84 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto @@ -571,6 +571,8 @@ message YarnClusterMetricsProto { optional int32 num_lost_nms = 4; optional int32 num_unhealthy_nms = 5; optional int32 num_rebooted_nms = 6; + optional int32 num_decommissioning_nms = 7; + optional int32 num_shutdown_nms = 8; } enum QueueStateProto { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/TopCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/TopCLI.java index 79b1406ed1..59d82275e6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/TopCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/TopCLI.java @@ -339,9 +339,11 @@ private static class NodesInformation { int totalNodes; int runningNodes; int unhealthyNodes; + int decommissioningNodes; int decommissionedNodes; int lostNodes; int rebootedNodes; + int shutdownNodes; } private static class QueueMetrics { @@ -696,6 +698,8 @@ protected NodesInformation getNodesInfo() { return nodeInfo; } + nodeInfo.decommissioningNodes = + yarnClusterMetrics.getNumDecommissioningNodeManagers(); nodeInfo.decommissionedNodes = yarnClusterMetrics.getNumDecommissionedNodeManagers(); nodeInfo.totalNodes = yarnClusterMetrics.getNumNodeManagers(); @@ -703,6 +707,7 @@ protected NodesInformation getNodesInfo() { nodeInfo.lostNodes = yarnClusterMetrics.getNumLostNodeManagers(); nodeInfo.unhealthyNodes = yarnClusterMetrics.getNumUnhealthyNodeManagers(); nodeInfo.rebootedNodes = yarnClusterMetrics.getNumRebootedNodeManagers(); + nodeInfo.shutdownNodes = yarnClusterMetrics.getNumShutdownNodeManagers(); return nodeInfo; } @@ -880,11 +885,11 @@ String getHeader(QueueMetrics queueMetrics, NodesInformation nodes) { ret.append(CLEAR_LINE) .append(limitLineLength(String.format( "NodeManager(s)" - + ": %d total, %d active, %d unhealthy, %d decommissioned," - + " %d lost, %d rebooted%n", + + ": %d total, %d active, %d unhealthy, %d decommissioning," + + " %d decommissioned, %d lost, %d rebooted, %d shutdown%n", nodes.totalNodes, nodes.runningNodes, nodes.unhealthyNodes, - nodes.decommissionedNodes, nodes.lostNodes, - nodes.rebootedNodes), terminalWidth, true)); + nodes.decommissioningNodes, nodes.decommissionedNodes, nodes.lostNodes, + nodes.rebootedNodes, nodes.shutdownNodes), terminalWidth, true)); ret.append(CLEAR_LINE) .append(limitLineLength(String.format( @@ -1039,7 +1044,8 @@ protected void showFieldsScreen() { } } - protected void showTopScreen() { + @VisibleForTesting + void showTopScreen() { List appsInfo = new ArrayList<>(); List apps; try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestTopCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestTopCLI.java index 706400f80d..63ebffaca4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestTopCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestTopCLI.java @@ -18,7 +18,12 @@ package org.apache.hadoop.yarn.client.cli; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.PrintStream; import java.net.URL; import java.util.Arrays; import java.util.HashMap; @@ -27,9 +32,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import org.apache.hadoop.yarn.client.api.YarnClient; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -47,6 +56,9 @@ public class TestTopCLI { private static Map savedStaticResolution = new HashMap<>(); + private PrintStream stdout; + private PrintStream stderr; + @BeforeClass public static void initializeDummyHostnameResolution() throws Exception { String previousIpAddress; @@ -68,6 +80,18 @@ public static void restoreDummyHostnameResolution() throws Exception { } } + @Before + public void before() { + this.stdout = System.out; + this.stderr = System.err; + } + + @After + public void after() { + System.setOut(this.stdout); + System.setErr(this.stderr); + } + @Test public void testHAClusterInfoURL() throws IOException, InterruptedException { TopCLI topcli = new TopCLI(); @@ -103,4 +127,44 @@ public void testHAClusterInfoURL() throws IOException, InterruptedException { Assert.assertEquals("https", clusterUrl.getProtocol()); Assert.assertEquals(rm1Address, clusterUrl.getAuthority()); } -} \ No newline at end of file + + @Test + public void testHeaderNodeManagers() throws Exception { + YarnClusterMetrics ymetrics = mock(YarnClusterMetrics.class); + when(ymetrics.getNumNodeManagers()).thenReturn(0); + when(ymetrics.getNumDecommissioningNodeManagers()).thenReturn(1); + when(ymetrics.getNumDecommissionedNodeManagers()).thenReturn(2); + when(ymetrics.getNumActiveNodeManagers()).thenReturn(3); + when(ymetrics.getNumLostNodeManagers()).thenReturn(4); + when(ymetrics.getNumUnhealthyNodeManagers()).thenReturn(5); + when(ymetrics.getNumRebootedNodeManagers()).thenReturn(6); + when(ymetrics.getNumShutdownNodeManagers()).thenReturn(7); + + YarnClient client = mock(YarnClient.class); + when(client.getYarnClusterMetrics()).thenReturn(ymetrics); + + TopCLI topcli = new TopCLI() { + @Override protected void createAndStartYarnClient() { + } + }; + topcli.setClient(client); + topcli.terminalWidth = 200; + + String actual; + try (ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + PrintStream out = new PrintStream(outStream)) { + System.setOut(out); + System.setErr(out); + topcli.showTopScreen(); + out.flush(); + actual = outStream.toString("UTF-8"); + } + + String expected = "NodeManager(s)" + + ": 0 total, 3 active, 5 unhealthy, 1 decommissioning," + + " 2 decommissioned, 4 lost, 6 rebooted, 7 shutdown"; + Assert.assertTrue( + String.format("Expected output to contain [%s], actual output was [%s].", expected, actual), + actual.contains(expected)); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/YarnClusterMetricsPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/YarnClusterMetricsPBImpl.java index 14f8bff581..608d3c28af 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/YarnClusterMetricsPBImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/YarnClusterMetricsPBImpl.java @@ -89,6 +89,22 @@ public void setNumNodeManagers(int numNodeManagers) { builder.setNumNodeManagers((numNodeManagers)); } + @Override + public int getNumDecommissioningNodeManagers() { + YarnClusterMetricsProtoOrBuilder p = viaProto ? proto : builder; + if (p.hasNumDecommissioningNms()) { + return (p.getNumDecommissioningNms()); + } + return 0; + } + + @Override + public void + setNumDecommissioningNodeManagers(int numDecommissioningNodeManagers) { + maybeInitBuilder(); + builder.setNumDecommissioningNms(numDecommissioningNodeManagers); + } + @Override public int getNumDecommissionedNodeManagers() { YarnClusterMetricsProtoOrBuilder p = viaProto ? proto : builder; @@ -165,4 +181,19 @@ public void setNumRebootedNodeManagers(int numRebootedNodeManagers) { maybeInitBuilder(); builder.setNumRebootedNms((numRebootedNodeManagers)); } -} \ No newline at end of file + + @Override + public int getNumShutdownNodeManagers() { + YarnClusterMetricsProtoOrBuilder p = viaProto ? proto : builder; + if (p.hasNumShutdownNms()) { + return (p.getNumShutdownNms()); + } + return 0; + } + + @Override + public void setNumShutdownNodeManagers(int numShutdownNodeManagers) { + maybeInitBuilder(); + builder.setNumShutdownNms(numShutdownNodeManagers); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java index f008ef2a44..c725c2c0b3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClientRMService.java @@ -861,12 +861,14 @@ public GetClusterMetricsResponse getClusterMetrics( .newRecordInstance(YarnClusterMetrics.class); ymetrics.setNumNodeManagers(this.rmContext.getRMNodes().size()); ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics(); + ymetrics.setNumDecommissioningNodeManagers(clusterMetrics.getNumDecommissioningNMs()); ymetrics.setNumDecommissionedNodeManagers(clusterMetrics .getNumDecommisionedNMs()); ymetrics.setNumActiveNodeManagers(clusterMetrics.getNumActiveNMs()); ymetrics.setNumLostNodeManagers(clusterMetrics.getNumLostNMs()); ymetrics.setNumUnhealthyNodeManagers(clusterMetrics.getUnhealthyNMs()); ymetrics.setNumRebootedNodeManagers(clusterMetrics.getNumRebootedNMs()); + ymetrics.setNumShutdownNodeManagers(clusterMetrics.getNumShutdownNMs()); response.setClusterMetrics(ymetrics); return response; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java index a4629017a5..8307f88c55 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java @@ -60,6 +60,7 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.ipc.Server; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.MockApps; import org.apache.hadoop.yarn.api.ApplicationClientProtocol; @@ -76,6 +77,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationsResponse; import org.apache.hadoop.yarn.api.protocolrecords.GetAttributesToNodesRequest; import org.apache.hadoop.yarn.api.protocolrecords.GetAttributesToNodesResponse; +import org.apache.hadoop.yarn.api.protocolrecords.GetClusterMetricsRequest; import org.apache.hadoop.yarn.api.protocolrecords.GetClusterNodeAttributesRequest; import org.apache.hadoop.yarn.api.protocolrecords.GetClusterNodeAttributesResponse; import org.apache.hadoop.yarn.api.protocolrecords.GetClusterNodeLabelsRequest; @@ -140,6 +142,7 @@ import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Event; @@ -2809,10 +2812,63 @@ protected ClientRMService createClientRMService() { rm.close(); } + @Test + public void testGetClusterMetrics() throws Exception { + MockRM rm = new MockRM() { + protected ClientRMService createClientRMService() { + return new ClientRMService(this.rmContext, scheduler, + this.rmAppManager, this.applicationACLsManager, this.queueACLsManager, + this.getRMContext().getRMDelegationTokenSecretManager()); + }; + }; + rm.start(); + + ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics(); + clusterMetrics.incrDecommissioningNMs(); + repeat(2, clusterMetrics::incrDecommisionedNMs); + repeat(3, clusterMetrics::incrNumActiveNodes); + repeat(4, clusterMetrics::incrNumLostNMs); + repeat(5, clusterMetrics::incrNumUnhealthyNMs); + repeat(6, clusterMetrics::incrNumRebootedNMs); + repeat(7, clusterMetrics::incrNumShutdownNMs); + + // Create a client. + Configuration conf = new Configuration(); + YarnRPC rpc = YarnRPC.create(conf); + InetSocketAddress rmAddress = rm.getClientRMService().getBindAddress(); + LOG.info("Connecting to ResourceManager at " + rmAddress); + ApplicationClientProtocol client = + (ApplicationClientProtocol) rpc + .getProxy(ApplicationClientProtocol.class, rmAddress, conf); + + YarnClusterMetrics ymetrics = client.getClusterMetrics( + GetClusterMetricsRequest.newInstance()).getClusterMetrics(); + + Assert.assertEquals(0, ymetrics.getNumNodeManagers()); + Assert.assertEquals(1, ymetrics.getNumDecommissioningNodeManagers()); + Assert.assertEquals(2, ymetrics.getNumDecommissionedNodeManagers()); + Assert.assertEquals(3, ymetrics.getNumActiveNodeManagers()); + Assert.assertEquals(4, ymetrics.getNumLostNodeManagers()); + Assert.assertEquals(5, ymetrics.getNumUnhealthyNodeManagers()); + Assert.assertEquals(6, ymetrics.getNumRebootedNodeManagers()); + Assert.assertEquals(7, ymetrics.getNumShutdownNodeManagers()); + + rpc.stopProxy(client, conf); + rm.close(); + } + @After public void tearDown(){ if (resourceTypesFile != null && resourceTypesFile.exists()) { resourceTypesFile.delete(); } + ClusterMetrics.destroy(); + DefaultMetricsSystem.shutdown(); + } + + private static void repeat(int n, Runnable r) { + for (int i = 0; i < n; ++i) { + r.run(); + } } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/RouterYarnClientUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/RouterYarnClientUtils.java index 50abcf40a8..66062ba337 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/RouterYarnClientUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/RouterYarnClientUtils.java @@ -39,6 +39,9 @@ public static GetClusterMetricsResponse merge( tmp.getNumNodeManagers() + metrics.getNumNodeManagers()); tmp.setNumActiveNodeManagers( tmp.getNumActiveNodeManagers() + metrics.getNumActiveNodeManagers()); + tmp.setNumDecommissioningNodeManagers( + tmp.getNumDecommissioningNodeManagers() + metrics + .getNumDecommissioningNodeManagers()); tmp.setNumDecommissionedNodeManagers( tmp.getNumDecommissionedNodeManagers() + metrics .getNumDecommissionedNodeManagers()); @@ -49,6 +52,9 @@ public static GetClusterMetricsResponse merge( tmp.setNumUnhealthyNodeManagers( tmp.getNumUnhealthyNodeManagers() + metrics .getNumUnhealthyNodeManagers()); + tmp.setNumShutdownNodeManagers( + tmp.getNumShutdownNodeManagers() + metrics + .getNumShutdownNodeManagers()); } return GetClusterMetricsResponse.newInstance(tmp); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestRouterYarnClientUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestRouterYarnClientUtils.java index d062f9d0b5..4cc6c5a188 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestRouterYarnClientUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestRouterYarnClientUtils.java @@ -38,10 +38,12 @@ public void testClusterMetricsMerge() { YarnClusterMetrics resultMetrics = result.getClusterMetrics(); Assert.assertEquals(3, resultMetrics.getNumNodeManagers()); Assert.assertEquals(3, resultMetrics.getNumActiveNodeManagers()); + Assert.assertEquals(3, resultMetrics.getNumDecommissioningNodeManagers()); Assert.assertEquals(3, resultMetrics.getNumDecommissionedNodeManagers()); Assert.assertEquals(3, resultMetrics.getNumLostNodeManagers()); Assert.assertEquals(3, resultMetrics.getNumRebootedNodeManagers()); Assert.assertEquals(3, resultMetrics.getNumUnhealthyNodeManagers()); + Assert.assertEquals(3, resultMetrics.getNumShutdownNodeManagers()); } public GetClusterMetricsResponse getClusterMetricsResponse(int value) { @@ -49,9 +51,11 @@ public GetClusterMetricsResponse getClusterMetricsResponse(int value) { metrics.setNumUnhealthyNodeManagers(value); metrics.setNumRebootedNodeManagers(value); metrics.setNumLostNodeManagers(value); + metrics.setNumDecommissioningNodeManagers(value); metrics.setNumDecommissionedNodeManagers(value); metrics.setNumActiveNodeManagers(value); metrics.setNumNodeManagers(value); + metrics.setNumShutdownNodeManagers(value); return GetClusterMetricsResponse.newInstance(metrics); } }