YARN-9428. Add metrics for paused containers in NodeManager. Contributed by Abhishek Modi.
This commit is contained in:
parent
da7f8c244d
commit
ab2bda57bd
@ -161,6 +161,7 @@ private ReInitializationContext createContextForRollback() {
|
||||
private final StringBuilder diagnostics;
|
||||
private final int diagnosticsMaxSize;
|
||||
private boolean wasLaunched;
|
||||
private boolean wasPaused;
|
||||
private long containerLocalizationStartTime;
|
||||
private long containerLaunchStartTime;
|
||||
private ContainerMetrics containerMetrics;
|
||||
@ -1541,6 +1542,7 @@ static class RecoveredContainerTransition extends ContainerTransition {
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
container.sendContainerMonitorStartEvent();
|
||||
container.wasLaunched = true;
|
||||
container.setIsPaused(true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1561,6 +1563,7 @@ public ExitedWithSuccessTransition(boolean clCleanupRequired) {
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
|
||||
container.setIsReInitializing(false);
|
||||
container.setIsPaused(false);
|
||||
// Set exit code to 0 on success
|
||||
container.exitCode = 0;
|
||||
|
||||
@ -1591,6 +1594,7 @@ public ExitedWithFailureTransition(boolean clCleanupRequired) {
|
||||
|
||||
@Override
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
container.setIsPaused(false);
|
||||
container.setIsReInitializing(false);
|
||||
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
|
||||
container.exitCode = exitEvent.getExitCode();
|
||||
@ -1835,6 +1839,7 @@ static class KillTransition implements
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
// Kill the process/process-grp
|
||||
container.setIsReInitializing(false);
|
||||
container.setIsPaused(false);
|
||||
container.dispatcher.getEventHandler().handle(
|
||||
new ContainersLauncherEvent(container,
|
||||
ContainersLauncherEventType.CLEANUP_CONTAINER));
|
||||
@ -2080,6 +2085,8 @@ static class PausedContainerTransition implements
|
||||
SingleArcTransition<ContainerImpl, ContainerEvent> {
|
||||
@Override
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
container.setIsPaused(true);
|
||||
container.metrics.pausedContainer();
|
||||
// Container was PAUSED so tell the scheduler
|
||||
container.dispatcher.getEventHandler().handle(
|
||||
new ContainerSchedulerEvent(container,
|
||||
@ -2096,6 +2103,7 @@ static class ResumeContainerTransition implements
|
||||
SingleArcTransition<ContainerImpl, ContainerEvent> {
|
||||
@Override
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
container.setIsPaused(false);
|
||||
// Pause the process/process-grp if it is supported by the container
|
||||
container.dispatcher.getEventHandler().handle(
|
||||
new ContainersLauncherEvent(container,
|
||||
@ -2154,6 +2162,13 @@ private static boolean shouldBeUploadedToSharedCache(ContainerImpl container,
|
||||
return container.resourceSet.getResourcesUploadPolicies().get(resource);
|
||||
}
|
||||
|
||||
private void setIsPaused(boolean paused) {
|
||||
if (this.wasPaused && !paused) {
|
||||
this.metrics.endPausedContainer();
|
||||
}
|
||||
this.wasPaused = paused;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
ContainerRetryContext getContainerRetryContext() {
|
||||
return containerRetryContext;
|
||||
|
@ -44,6 +44,7 @@ public class NodeManagerMetrics {
|
||||
@Metric("# of initializing containers")
|
||||
MutableGaugeInt containersIniting;
|
||||
@Metric MutableGaugeInt containersRunning;
|
||||
// Gauge of paused containers: incremented by pausedContainer(), decremented
// by endPausedContainer(), and read through getPausedContainers().
@Metric("# of paused containers") MutableGaugeInt containersPaused;
|
||||
@Metric("Current allocated memory in GB")
|
||||
MutableGaugeInt allocatedGB;
|
||||
@Metric("Current # of allocated containers")
|
||||
@ -168,6 +169,14 @@ public void endReInitingContainer() {
|
||||
containersReIniting.decr();
|
||||
}
|
||||
|
||||
public void pausedContainer() {
|
||||
containersPaused.incr();
|
||||
}
|
||||
|
||||
public void endPausedContainer() {
|
||||
containersPaused.decr();
|
||||
}
|
||||
|
||||
public void allocateContainer(Resource res) {
|
||||
allocatedContainers.incr();
|
||||
allocatedMB = allocatedMB + res.getMemorySize();
|
||||
@ -268,6 +277,10 @@ public int getRunningContainers() {
|
||||
return containersRunning.value();
|
||||
}
|
||||
|
||||
public int getPausedContainers() {
|
||||
return containersPaused.value();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public int getKilledContainers() {
|
||||
return containersKilled.value();
|
||||
|
@ -246,13 +246,16 @@ public void testContainerPauseAndResume() throws Exception {
|
||||
wc.initContainer();
|
||||
wc.localizeResources();
|
||||
int running = metrics.getRunningContainers();
|
||||
int paused = metrics.getPausedContainers();
|
||||
wc.launchContainer();
|
||||
assertEquals(running + 1, metrics.getRunningContainers());
|
||||
reset(wc.localizerBus);
|
||||
wc.pauseContainer();
|
||||
assertEquals(ContainerState.PAUSED,
|
||||
wc.c.getContainerState());
|
||||
assertEquals(paused + 1, metrics.getPausedContainers());
|
||||
wc.resumeContainer();
|
||||
assertEquals(paused, metrics.getPausedContainers());
|
||||
assertEquals(ContainerState.RUNNING,
|
||||
wc.c.getContainerState());
|
||||
wc.containerKilledOnRequest();
|
||||
|
Loading…
Reference in New Issue
Block a user