From ab2bda57bd9ad617342586d5769121a4fef4eab1 Mon Sep 17 00:00:00 2001 From: Giovanni Matteo Fumarola Date: Mon, 1 Apr 2019 14:21:17 -0700 Subject: [PATCH] YARN-9428. Add metrics for paused containers in NodeManager. Contributed by Abhishek Modi. --- .../containermanager/container/ContainerImpl.java | 15 +++++++++++++++ .../nodemanager/metrics/NodeManagerMetrics.java | 13 +++++++++++++ .../containermanager/container/TestContainer.java | 3 +++ 3 files changed, 31 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 00e6aa77b5..cfade27f93 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -161,6 +161,7 @@ private ReInitializationContext createContextForRollback() { private final StringBuilder diagnostics; private final int diagnosticsMaxSize; private boolean wasLaunched; + private boolean wasPaused; private long containerLocalizationStartTime; private long containerLaunchStartTime; private ContainerMetrics containerMetrics; @@ -1541,6 +1542,7 @@ static class RecoveredContainerTransition extends ContainerTransition { public void transition(ContainerImpl container, ContainerEvent event) { container.sendContainerMonitorStartEvent(); container.wasLaunched = true; + container.setIsPaused(true); } } @@ -1561,6 +1563,7 @@ public ExitedWithSuccessTransition(boolean clCleanupRequired) { public void transition(ContainerImpl container, ContainerEvent event) { container.setIsReInitializing(false); + container.setIsPaused(false); // Set exit code to 0 on success container.exitCode = 0; @@ -1591,6 +1594,7 @@ public ExitedWithFailureTransition(boolean clCleanupRequired) { @Override public void transition(ContainerImpl container, ContainerEvent event) { + container.setIsPaused(false); container.setIsReInitializing(false); ContainerExitEvent exitEvent = (ContainerExitEvent) event; container.exitCode = exitEvent.getExitCode(); @@ -1835,6 +1839,7 @@ static class KillTransition implements public void transition(ContainerImpl container, ContainerEvent event) { // Kill the process/process-grp container.setIsReInitializing(false); + container.setIsPaused(false); container.dispatcher.getEventHandler().handle( new ContainersLauncherEvent(container, ContainersLauncherEventType.CLEANUP_CONTAINER)); @@ -2080,6 +2085,8 @@ static class PausedContainerTransition implements SingleArcTransition { @Override public void transition(ContainerImpl container, ContainerEvent event) { + container.setIsPaused(true); + container.metrics.pausedContainer(); // Container was PAUSED so tell the scheduler container.dispatcher.getEventHandler().handle( new ContainerSchedulerEvent(container, @@ -2096,6 +2103,7 @@ static class ResumeContainerTransition implements SingleArcTransition { @Override public void transition(ContainerImpl container, ContainerEvent event) { + container.setIsPaused(false); // Pause the process/process-grp if it is supported by the container container.dispatcher.getEventHandler().handle( new ContainersLauncherEvent(container, @@ -2154,6 +2162,13 @@ private static boolean shouldBeUploadedToSharedCache(ContainerImpl container, return container.resourceSet.getResourcesUploadPolicies().get(resource); } + private void setIsPaused(boolean paused) { + if (this.wasPaused && !paused) { + this.metrics.endPausedContainer(); + } + this.wasPaused = paused; + } + @VisibleForTesting ContainerRetryContext getContainerRetryContext() { return containerRetryContext; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index 823a9d9d2f..8ecc1a17ca 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -44,6 +44,7 @@ public class NodeManagerMetrics { @Metric("# of initializing containers") MutableGaugeInt containersIniting; @Metric MutableGaugeInt containersRunning; + @Metric("# of paused containers") MutableGaugeInt containersPaused; @Metric("Current allocated memory in GB") MutableGaugeInt allocatedGB; @Metric("Current # of allocated containers") @@ -168,6 +169,14 @@ public void endReInitingContainer() { containersReIniting.decr(); } + public void pausedContainer() { + containersPaused.incr(); + } + + public void endPausedContainer() { + containersPaused.decr(); + } + public void allocateContainer(Resource res) { allocatedContainers.incr(); allocatedMB = allocatedMB + res.getMemorySize(); @@ -268,6 +277,10 @@ public int getRunningContainers() { return containersRunning.value(); } + public int getPausedContainers() { + return containersPaused.value(); + } + @VisibleForTesting public int getKilledContainers() { return containersKilled.value(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index 4d7559ec23..ea3acca35e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -246,13 +246,16 @@ public void testContainerPauseAndResume() throws Exception { wc.initContainer(); wc.localizeResources(); int running = metrics.getRunningContainers(); + int paused = metrics.getPausedContainers(); wc.launchContainer(); assertEquals(running + 1, metrics.getRunningContainers()); reset(wc.localizerBus); wc.pauseContainer(); assertEquals(ContainerState.PAUSED, wc.c.getContainerState()); + assertEquals(paused + 1, metrics.getPausedContainers()); wc.resumeContainer(); + assertEquals(paused, metrics.getPausedContainers()); assertEquals(ContainerState.RUNNING, wc.c.getContainerState()); wc.containerKilledOnRequest();