YARN-2091. Add more values to ContainerExitStatus and pass it from NM to RM and then to app masters (Tsuyoshi OZAWA via bikas)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1601762 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Bikas Saha 2014-06-10 20:08:33 +00:00
parent 5de6f72054
commit ecfd43a2f1
11 changed files with 76 additions and 20 deletions

View File

@ -155,6 +155,9 @@ Release 2.5.0 - UNRELEASED
DummyApplicationResourceUsageReport for all invalid accesses.
(Ray Chiang via kasha)
YARN-2091. Add more values to ContainerExitStatus and pass it from NM to
RM and then to app masters (Tsuyoshi OZAWA via bikas)
OPTIMIZATIONS
BUG FIXES

View File

@ -46,4 +46,30 @@ public class ContainerExitStatus {
* Containers preempted by the framework.
*/
public static final int PREEMPTED = -102;
/**
* Container terminated because of exceeding allocated virtual memory.
*/
public static final int KILLED_EXCEEDED_VMEM = -103;
/**
* Container terminated because of exceeding allocated physical memory.
*/
public static final int KILLED_EXCEEDED_PMEM = -104;
/**
* Container was terminated by a stop request from the app master.
*/
public static final int KILLED_BY_APPMASTER = -105;
/**
* Container was terminated by the resource manager.
*/
public static final int KILLED_BY_RESOURCEMANAGER = -106;
/**
* Container was terminated after the application finished.
*/
public static final int KILLED_AFTER_APP_COMPLETION = -107;
}

View File

@ -64,6 +64,7 @@
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersResponse;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerState;
@ -738,7 +739,8 @@ private void stopContainerInternal(NMTokenIdentifier nmTokenIdentifier,
} else {
dispatcher.getEventHandler().handle(
new ContainerKillEvent(containerID,
"Container killed by the ApplicationMaster."));
ContainerExitStatus.KILLED_BY_APPMASTER,
"Container killed by the ApplicationMaster."));
NMAuditLogger.logSuccess(container.getUser(),
AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID
@ -887,6 +889,7 @@ public void handle(ContainerManagerEvent event) {
.getContainersToCleanup()) {
this.dispatcher.getEventHandler().handle(
new ContainerKillEvent(container,
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
"Container Killed by ResourceManager"));
}
break;

View File

@ -30,6 +30,7 @@
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.logaggregation.ContainerLogsRetentionPolicy;
@ -375,6 +376,7 @@ public ApplicationState transition(ApplicationImpl app,
for (ContainerId containerID : app.containers.keySet()) {
app.dispatcher.getEventHandler().handle(
new ContainerKillEvent(containerID,
ContainerExitStatus.KILLED_AFTER_APP_COMPLETION,
"Container killed on application-finish event: " + appEvent.getDiagnostic()));
}
return ApplicationState.FINISHING_CONTAINERS_WAIT;

View File

@ -48,7 +48,6 @@
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
@ -773,7 +772,7 @@ public void transition(ContainerImpl container, ContainerEvent event) {
container.cleanup();
container.metrics.endInitingContainer();
ContainerKillEvent killEvent = (ContainerKillEvent) event;
container.exitCode = ExitCode.TERMINATED.getExitCode();
container.exitCode = killEvent.getContainerExitStatus();
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
container.diagnostics.append("Container is killed before being launched.\n");
}
@ -817,6 +816,7 @@ public void transition(ContainerImpl container, ContainerEvent event) {
ContainersLauncherEventType.CLEANUP_CONTAINER));
ContainerKillEvent killEvent = (ContainerKillEvent) event;
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
container.exitCode = killEvent.getContainerExitStatus();
}
}
@ -829,7 +829,10 @@ static class ContainerKilledTransition implements
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
container.exitCode = exitEvent.getExitCode();
if (container.hasDefaultExitCode()) {
container.exitCode = exitEvent.getExitCode();
}
if (exitEvent.getDiagnosticInfo() != null) {
container.diagnostics.append(exitEvent.getDiagnosticInfo())
.append('\n');
@ -871,7 +874,7 @@ static class KillOnNewTransition extends ContainerDoneTransition {
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
ContainerKillEvent killEvent = (ContainerKillEvent) event;
container.exitCode = ExitCode.TERMINATED.getExitCode();
container.exitCode = killEvent.getContainerExitStatus();
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
container.diagnostics.append("Container is killed before being launched.\n");
super.transition(container, event);
@ -928,4 +931,9 @@ public String toString() {
this.readLock.unlock();
}
}
/**
 * Reports whether this container's exit code still holds
 * {@link ContainerExitStatus#INVALID}.
 *
 * @return {@code true} if {@code exitCode} equals
 *         {@code ContainerExitStatus.INVALID}, {@code false} otherwise
 */
private boolean hasDefaultExitCode() {
  final boolean exitCodeUnchanged = ContainerExitStatus.INVALID == this.exitCode;
  return exitCodeUnchanged;
}
}

View File

@ -23,13 +23,21 @@
/**
 * Container event of type {@code KILL_CONTAINER} that carries an exit status
 * and a diagnostic message supplied by whoever requests the kill.
 */
public class ContainerKillEvent extends ContainerEvent {
// Diagnostic text supplied by the sender of the kill request.
private final String diagnostic;
// Exit status supplied by the sender of the kill request.
private final int exitStatus;
// NOTE(review): the next line appears to be the pre-change two-argument
// constructor header left over from the diff view; the three-argument header
// that follows looks like its replacement — confirm.
public ContainerKillEvent(ContainerId cID, String diagnostic) {
/**
 * Creates a kill event for the given container.
 *
 * @param cID id of the container the event is addressed to
 * @param exitStatus exit status stored in this event
 * @param diagnostic diagnostic message stored in this event
 */
public ContainerKillEvent(ContainerId cID,
int exitStatus, String diagnostic) {
super(cID, ContainerEventType.KILL_CONTAINER);
this.exitStatus = exitStatus;
this.diagnostic = diagnostic;
}
/**
 * @return the diagnostic message passed to the constructor
 */
public String getDiagnostic() {
return this.diagnostic;
}
/**
 * @return the exit status passed to the constructor
 */
public int getContainerExitStatus() {
return this.exitStatus;
}
}

View File

@ -30,6 +30,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
@ -403,6 +404,7 @@ public void run() {
boolean isMemoryOverLimit = false;
String msg = "";
int containerExitStatus = ContainerExitStatus.INVALID;
if (isVmemCheckEnabled()
&& isProcessTreeOverLimit(containerId.toString(),
currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
@ -414,6 +416,7 @@ && isProcessTreeOverLimit(containerId.toString(),
currentPmemUsage, pmemLimit,
pId, containerId, pTree);
isMemoryOverLimit = true;
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
} else if (isPmemCheckEnabled()
&& isProcessTreeOverLimit(containerId.toString(),
currentPmemUsage, curRssMemUsageOfAgedProcesses,
@ -426,6 +429,7 @@ && isProcessTreeOverLimit(containerId.toString(),
currentPmemUsage, pmemLimit,
pId, containerId, pTree);
isMemoryOverLimit = true;
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
}
if (isMemoryOverLimit) {
@ -440,7 +444,8 @@ && isProcessTreeOverLimit(containerId.toString(),
}
// kill the container
eventDispatcher.getEventHandler().handle(
new ContainerKillEvent(containerId, msg));
new ContainerKillEvent(containerId,
containerExitStatus, msg));
it.remove();
LOG.info("Removed ProcessTree with root " + pId);
} else {

View File

@ -31,6 +31,7 @@
import java.util.List;
import java.util.Map;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.junit.Assert;
import org.apache.commons.logging.LogFactory;
@ -68,7 +69,6 @@
import org.apache.hadoop.yarn.security.NMTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestAuxServices.ServiceA;
@ -348,8 +348,7 @@ public void testContainerLaunchAndStop() throws IOException,
GetContainerStatusesRequest.newInstance(containerIds);
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
ExitCode.TERMINATED.getExitCode();
int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
// Assert that the process is not alive anymore

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
@ -319,7 +320,7 @@ public void testKillOnNew() throws Exception {
assertEquals(ContainerState.NEW, wc.c.getContainerState());
wc.killContainer();
assertEquals(ContainerState.DONE, wc.c.getContainerState());
assertEquals(ExitCode.TERMINATED.getExitCode(),
assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
wc.c.cloneAndGetContainerStatus().getExitStatus());
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
.contains("KillRequest"));
@ -339,7 +340,7 @@ public void testKillOnLocalizing() throws Exception {
assertEquals(ContainerState.LOCALIZING, wc.c.getContainerState());
wc.killContainer();
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
assertEquals(ExitCode.TERMINATED.getExitCode(),
assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
wc.c.cloneAndGetContainerStatus().getExitStatus());
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
.contains("KillRequest"));
@ -898,12 +899,14 @@ public void containerFailed(int exitCode) {
}
// Sends a KillRequest kill event for cId to the container, passing
// ContainerExitStatus.KILLED_BY_RESOURCEMANAGER as the exit status, and then
// calls drainDispatcherEvents().
public void killContainer() {
// NOTE(review): the two-argument call on the next line appears to be the
// pre-change line left over from the diff view; the three-argument call
// below looks like its replacement — confirm.
c.handle(new ContainerKillEvent(cId, "KillRequest"));
c.handle(new ContainerKillEvent(cId,
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
"KillRequest"));
drainDispatcherEvents();
}
public void containerKilledOnRequest() {
int exitCode = ExitCode.FORCE_KILLED.getExitCode();
int exitCode = ContainerExitStatus.KILLED_BY_RESOURCEMANAGER;
String diagnosticMsg = "Container completed with exit code " + exitCode;
c.handle(new ContainerExitEvent(cId,
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode,

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
@ -73,7 +74,6 @@
import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@ -604,8 +604,7 @@ public void testContainerEnvVariables() throws Exception {
GetContainerStatusesRequest.newInstance(containerIds);
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
ExitCode.TERMINATED.getExitCode();
int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
// Assert that the process is not alive anymore
@ -717,7 +716,7 @@ private void internalKillTest(boolean delayed) throws Exception {
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest)
.getContainerStatuses().get(0);
Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER,
containerStatus.getExitStatus());
// Now verify the contents of the file. Script generates a message when it

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@ -60,7 +61,6 @@
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
@ -270,7 +270,7 @@ public void testContainerKillOnMemoryOverflow() throws IOException,
GetContainerStatusesRequest.newInstance(containerIds);
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
Assert.assertEquals(ContainerExitStatus.KILLED_EXCEEDED_VMEM,
containerStatus.getExitStatus());
String expectedMsgPattern =
"Container \\[pid=" + pid + ",containerID=" + cId