YARN-2091. Add more values to ContainerExitStatus and pass it from NM to RM and then to app masters (Tsuyoshi OZAWA via bikas)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1601762 13f79535-47bb-0310-9956-ffa450edef68
commit ecfd43a2f1
parent 5de6f72054
@@ -155,6 +155,9 @@ Release 2.5.0 - UNRELEASED
     DummyApplicationResourceUsageReport for all invalid accesses.
     (Ray Chiang via kasha)
 
+    YARN-2091. Add more values to ContainerExitStatus and pass it from NM to
+    RM and then to app masters (Tsuyoshi OZAWA via bikas)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -46,4 +46,30 @@ public class ContainerExitStatus {
    * Containers preempted by the framework.
    */
   public static final int PREEMPTED = -102;
+
+  /**
+   * Container terminated because of exceeding allocated virtual memory.
+   */
+  public static final int KILLED_EXCEEDED_VMEM = -103;
+
+  /**
+   * Container terminated because of exceeding allocated physical memory.
+   */
+  public static final int KILLED_EXCEEDED_PMEM = -104;
+
+  /**
+   * Container was terminated by stop request by the app master.
+   */
+  public static final int KILLED_BY_APPMASTER = -105;
+
+  /**
+   * Container was terminated by the resource manager.
+   */
+  public static final int KILLED_BY_RESOURCEMANAGER = -106;
+
+  /**
+   * Container was terminated after the application finished.
+   */
+  public static final int KILLED_AFTER_APP_COMPLETION = -107;
+
 }
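
For context, a minimal AM-side sketch of how the richer exit statuses can be consumed once they are reported back through ContainerStatus.getExitStatus(). The helper class name and the retry/resize policy below are illustrative assumptions, not part of this patch.

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerStatus;

public final class ExitStatusPolicy {

  private ExitStatusPolicy() {
  }

  /**
   * True when the container did not fail on its own (preempted, killed by
   * the RM, or aborted), so re-requesting the same resources is reasonable.
   */
  public static boolean isRetriableWithSameResources(ContainerStatus status) {
    switch (status.getExitStatus()) {
    case ContainerExitStatus.PREEMPTED:
    case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER:
    case ContainerExitStatus.ABORTED:
      return true;
    default:
      return false;
    }
  }

  /**
   * True when the container was killed for exceeding its virtual or physical
   * memory limit; the AM may prefer to re-request a larger container.
   */
  public static boolean exceededMemoryLimit(ContainerStatus status) {
    int exitStatus = status.getExitStatus();
    return exitStatus == ContainerExitStatus.KILLED_EXCEEDED_VMEM
        || exitStatus == ContainerExitStatus.KILLED_EXCEEDED_PMEM;
  }
}
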
@@ -64,6 +64,7 @@
 import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
 import org.apache.hadoop.yarn.api.protocolrecords.StopContainersResponse;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
 import org.apache.hadoop.yarn.api.records.ContainerState;
@@ -738,7 +739,8 @@ private void stopContainerInternal(NMTokenIdentifier nmTokenIdentifier,
     } else {
       dispatcher.getEventHandler().handle(
           new ContainerKillEvent(containerID,
-              "Container killed by the ApplicationMaster."));
+              ContainerExitStatus.KILLED_BY_APPMASTER,
+              "Container killed by the ApplicationMaster."));
 
       NMAuditLogger.logSuccess(container.getUser(),
           AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID
@@ -887,6 +889,7 @@ public void handle(ContainerManagerEvent event) {
           .getContainersToCleanup()) {
         this.dispatcher.getEventHandler().handle(
             new ContainerKillEvent(container,
+                ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
                 "Container Killed by ResourceManager"));
       }
       break;
@@ -30,6 +30,7 @@
 import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.logaggregation.ContainerLogsRetentionPolicy;
@@ -375,6 +376,7 @@ public ApplicationState transition(ApplicationImpl app,
       for (ContainerId containerID : app.containers.keySet()) {
         app.dispatcher.getEventHandler().handle(
             new ContainerKillEvent(containerID,
+                ContainerExitStatus.KILLED_AFTER_APP_COMPLETION,
                 "Container killed on application-finish event: " + appEvent.getDiagnostic()));
       }
       return ApplicationState.FINISHING_CONTAINERS_WAIT;
@@ -48,7 +48,6 @@
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
 import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
 import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
@@ -773,7 +772,7 @@ public void transition(ContainerImpl container, ContainerEvent event) {
       container.cleanup();
       container.metrics.endInitingContainer();
       ContainerKillEvent killEvent = (ContainerKillEvent) event;
-      container.exitCode = ExitCode.TERMINATED.getExitCode();
+      container.exitCode = killEvent.getContainerExitStatus();
       container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
       container.diagnostics.append("Container is killed before being launched.\n");
     }
@@ -817,6 +816,7 @@ public void transition(ContainerImpl container, ContainerEvent event) {
               ContainersLauncherEventType.CLEANUP_CONTAINER));
       ContainerKillEvent killEvent = (ContainerKillEvent) event;
       container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
+      container.exitCode = killEvent.getContainerExitStatus();
     }
   }
 
@@ -829,7 +829,10 @@ static class ContainerKilledTransition implements
     @Override
     public void transition(ContainerImpl container, ContainerEvent event) {
       ContainerExitEvent exitEvent = (ContainerExitEvent) event;
-      container.exitCode = exitEvent.getExitCode();
+      if (container.hasDefaultExitCode()) {
+        container.exitCode = exitEvent.getExitCode();
+      }
+
       if (exitEvent.getDiagnosticInfo() != null) {
         container.diagnostics.append(exitEvent.getDiagnosticInfo())
             .append('\n');
@@ -871,7 +874,7 @@ static class KillOnNewTransition extends ContainerDoneTransition {
     @Override
     public void transition(ContainerImpl container, ContainerEvent event) {
       ContainerKillEvent killEvent = (ContainerKillEvent) event;
-      container.exitCode = ExitCode.TERMINATED.getExitCode();
+      container.exitCode = killEvent.getContainerExitStatus();
       container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
       container.diagnostics.append("Container is killed before being launched.\n");
       super.transition(container, event);
@@ -928,4 +931,9 @@ public String toString() {
       this.readLock.unlock();
     }
   }
+
+  private boolean hasDefaultExitCode() {
+    return (this.exitCode == ContainerExitStatus.INVALID);
+  }
+
 }
@@ -23,13 +23,21 @@
 public class ContainerKillEvent extends ContainerEvent {
 
   private final String diagnostic;
+  private final int exitStatus;
 
-  public ContainerKillEvent(ContainerId cID, String diagnostic) {
+  public ContainerKillEvent(ContainerId cID,
+      int exitStatus, String diagnostic) {
     super(cID, ContainerEventType.KILL_CONTAINER);
+    this.exitStatus = exitStatus;
     this.diagnostic = diagnostic;
   }
 
   public String getDiagnostic() {
     return this.diagnostic;
   }
+
+  public int getContainerExitStatus() {
+    return this.exitStatus;
+  }
+
 }
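
A usage sketch of the widened event API, not part of the patch: it constructs the event the way the NM call sites above now do and reads the status back through the new getter. The ContainerId factory calls are only assumed here for the sake of a self-contained example.

import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent;

public class ContainerKillEventSketch {
  public static void main(String[] args) {
    // A throwaway ContainerId, built only so the event can be constructed here.
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
    ContainerId containerId = ContainerId.newInstance(attemptId, 1);

    // The updated constructor carries an explicit exit status next to the
    // human-readable diagnostic; the NM call sites above pass
    // KILLED_BY_APPMASTER, KILLED_BY_RESOURCEMANAGER,
    // KILLED_AFTER_APP_COMPLETION, or a KILLED_EXCEEDED_* value depending on
    // who requested the kill.
    ContainerKillEvent kill = new ContainerKillEvent(containerId,
        ContainerExitStatus.KILLED_BY_APPMASTER,
        "Container killed by the ApplicationMaster.");

    // Downstream transitions in ContainerImpl read both fields and surface
    // them in the container's final ContainerStatus.
    System.out.println(kill.getContainerExitStatus() + ": " + kill.getDiagnostic());
  }
}
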
@@ -30,6 +30,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.service.AbstractService;
 import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.AsyncDispatcher;
@@ -403,6 +404,7 @@ public void run() {
 
           boolean isMemoryOverLimit = false;
           String msg = "";
+          int containerExitStatus = ContainerExitStatus.INVALID;
           if (isVmemCheckEnabled()
               && isProcessTreeOverLimit(containerId.toString(),
                   currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
@@ -414,6 +416,7 @@ && isProcessTreeOverLimit(containerId.toString(),
                     currentPmemUsage, pmemLimit,
                     pId, containerId, pTree);
             isMemoryOverLimit = true;
+            containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
           } else if (isPmemCheckEnabled()
               && isProcessTreeOverLimit(containerId.toString(),
                   currentPmemUsage, curRssMemUsageOfAgedProcesses,
@@ -426,6 +429,7 @@ && isProcessTreeOverLimit(containerId.toString(),
                     currentPmemUsage, pmemLimit,
                     pId, containerId, pTree);
             isMemoryOverLimit = true;
+            containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
           }
 
           if (isMemoryOverLimit) {
@@ -440,7 +444,8 @@ && isProcessTreeOverLimit(containerId.toString(),
             }
             // kill the container
             eventDispatcher.getEventHandler().handle(
-                new ContainerKillEvent(containerId, msg));
+                new ContainerKillEvent(containerId,
+                    containerExitStatus, msg));
             it.remove();
             LOG.info("Removed ProcessTree with root " + pId);
           } else {
@@ -31,6 +31,7 @@
 import java.util.List;
 import java.util.Map;
 
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.junit.Assert;
 
 import org.apache.commons.logging.LogFactory;
@@ -68,7 +69,6 @@
 import org.apache.hadoop.yarn.security.NMTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
 import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
 import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestAuxServices.ServiceA;
@@ -348,8 +348,7 @@ public void testContainerLaunchAndStop() throws IOException,
         GetContainerStatusesRequest.newInstance(containerIds);
     ContainerStatus containerStatus =
         containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
-    int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
-        ExitCode.TERMINATED.getExitCode();
+    int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
     Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
 
     // Assert that the process is not alive anymore
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
 
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -319,7 +320,7 @@ public void testKillOnNew() throws Exception {
       assertEquals(ContainerState.NEW, wc.c.getContainerState());
       wc.killContainer();
       assertEquals(ContainerState.DONE, wc.c.getContainerState());
-      assertEquals(ExitCode.TERMINATED.getExitCode(),
+      assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
           wc.c.cloneAndGetContainerStatus().getExitStatus());
       assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
           .contains("KillRequest"));
@@ -339,7 +340,7 @@ public void testKillOnLocalizing() throws Exception {
       assertEquals(ContainerState.LOCALIZING, wc.c.getContainerState());
       wc.killContainer();
       assertEquals(ContainerState.KILLING, wc.c.getContainerState());
-      assertEquals(ExitCode.TERMINATED.getExitCode(),
+      assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
          wc.c.cloneAndGetContainerStatus().getExitStatus());
       assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
           .contains("KillRequest"));
@@ -898,12 +899,14 @@ public void containerFailed(int exitCode) {
     }
 
     public void killContainer() {
-      c.handle(new ContainerKillEvent(cId, "KillRequest"));
+      c.handle(new ContainerKillEvent(cId,
+          ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
+          "KillRequest"));
       drainDispatcherEvents();
     }
 
     public void containerKilledOnRequest() {
-      int exitCode = ExitCode.FORCE_KILLED.getExitCode();
+      int exitCode = ContainerExitStatus.KILLED_BY_RESOURCEMANAGER;
       String diagnosticMsg = "Container completed with exit code " + exitCode;
       c.handle(new ContainerExitEvent(cId,
           ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode,
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
 
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.fail;
@@ -73,7 +74,6 @@
 import org.apache.hadoop.yarn.event.Event;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
 import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -604,8 +604,7 @@ public void testContainerEnvVariables() throws Exception {
         GetContainerStatusesRequest.newInstance(containerIds);
     ContainerStatus containerStatus =
         containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
-    int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
-        ExitCode.TERMINATED.getExitCode();
+    int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
     Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
 
     // Assert that the process is not alive anymore
@@ -717,7 +716,7 @@ private void internalKillTest(boolean delayed) throws Exception {
       ContainerStatus containerStatus =
           containerManager.getContainerStatuses(gcsRequest)
               .getContainerStatuses().get(0);
-      Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+      Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER,
          containerStatus.getExitStatus());
 
       // Now verify the contents of the file. Script generates a message when it
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
 
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -60,7 +61,6 @@
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
-import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
@@ -270,7 +270,7 @@ public void testContainerKillOnMemoryOverflow() throws IOException,
         GetContainerStatusesRequest.newInstance(containerIds);
     ContainerStatus containerStatus =
         containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
-    Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
+    Assert.assertEquals(ContainerExitStatus.KILLED_EXCEEDED_VMEM,
        containerStatus.getExitStatus());
     String expectedMsgPattern =
         "Container \\[pid=" + pid + ",containerID=" + cId