YARN-2846. Incorrect exit code persisted for running containers when reacquireContainer() is interrupted by a NodeManager restart. Contributed by Junping Du.
This commit is contained in:
parent
177e8090f5
commit
33ea5ae92b
hadoop-yarn-project
CHANGES.txt
hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager
@ -955,6 +955,10 @@ Release 2.6.0 - 2014-11-15
|
|||||||
YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
|
YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
|
||||||
zjshen)
|
zjshen)
|
||||||
|
|
||||||
|
YARN-2846. Incorrect persist exit code for running containers in
|
||||||
|
reacquireContainer() that interrupted by NodeManager restart. (Junping Du
|
||||||
|
via jlowe)
|
||||||
|
|
||||||
Release 2.5.2 - 2014-11-10
|
Release 2.5.2 - 2014-11-10
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
@ -159,9 +159,10 @@ public abstract class ContainerExecutor implements Configurable {
|
|||||||
* @param containerId The ID of the container to reacquire
|
* @param containerId The ID of the container to reacquire
|
||||||
* @return The exit code of the pre-existing container
|
* @return The exit code of the pre-existing container
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
|
* @throws InterruptedException
|
||||||
*/
|
*/
|
||||||
public int reacquireContainer(String user, ContainerId containerId)
|
public int reacquireContainer(String user, ContainerId containerId)
|
||||||
throws IOException {
|
throws IOException, InterruptedException {
|
||||||
Path pidPath = getPidFilePath(containerId);
|
Path pidPath = getPidFilePath(containerId);
|
||||||
if (pidPath == null) {
|
if (pidPath == null) {
|
||||||
LOG.warn(containerId + " is not active, returning terminated error");
|
LOG.warn(containerId + " is not active, returning terminated error");
|
||||||
@ -175,13 +176,8 @@ public abstract class ContainerExecutor implements Configurable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
LOG.info("Reacquiring " + containerId + " with pid " + pid);
|
LOG.info("Reacquiring " + containerId + " with pid " + pid);
|
||||||
try {
|
while(isContainerProcessAlive(user, pid)) {
|
||||||
while(isContainerProcessAlive(user, pid)) {
|
Thread.sleep(1000);
|
||||||
Thread.sleep(1000);
|
|
||||||
}
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
throw new IOException("Interrupted while waiting for process " + pid
|
|
||||||
+ " to exit", e);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// wait for exit code file to appear
|
// wait for exit code file to appear
|
||||||
@ -194,12 +190,9 @@ public abstract class ContainerExecutor implements Configurable {
|
|||||||
LOG.info(containerId + " was deactivated");
|
LOG.info(containerId + " was deactivated");
|
||||||
return ExitCode.TERMINATED.getExitCode();
|
return ExitCode.TERMINATED.getExitCode();
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
Thread.sleep(sleepMsec);
|
Thread.sleep(sleepMsec);
|
||||||
} catch (InterruptedException e) {
|
|
||||||
throw new IOException(
|
|
||||||
"Interrupted while waiting for exit code from " + containerId, e);
|
|
||||||
}
|
|
||||||
msecLeft -= sleepMsec;
|
msecLeft -= sleepMsec;
|
||||||
}
|
}
|
||||||
if (msecLeft < 0) {
|
if (msecLeft < 0) {
|
||||||
|
@ -347,7 +347,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int reacquireContainer(String user, ContainerId containerId)
|
public int reacquireContainer(String user, ContainerId containerId)
|
||||||
throws IOException {
|
throws IOException, InterruptedException {
|
||||||
try {
|
try {
|
||||||
return super.reacquireContainer(user, containerId);
|
return super.reacquireContainer(user, containerId);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -73,6 +73,7 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
|
|||||||
dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
|
dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
|
||||||
ContainerEventType.CONTAINER_LAUNCHED));
|
ContainerEventType.CONTAINER_LAUNCHED));
|
||||||
|
|
||||||
|
boolean notInterrupted = true;
|
||||||
try {
|
try {
|
||||||
File pidFile = locatePidFile(appIdStr, containerIdStr);
|
File pidFile = locatePidFile(appIdStr, containerIdStr);
|
||||||
if (pidFile != null) {
|
if (pidFile != null) {
|
||||||
@ -85,14 +86,19 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
|
|||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.error("Unable to recover container " + containerIdStr, e);
|
LOG.error("Unable to recover container " + containerIdStr, e);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
LOG.warn("Interrupted while waiting for exit code from " + containerId);
|
||||||
|
notInterrupted = false;
|
||||||
} finally {
|
} finally {
|
||||||
this.completed.set(true);
|
if (notInterrupted) {
|
||||||
exec.deactivateContainer(containerId);
|
this.completed.set(true);
|
||||||
try {
|
exec.deactivateContainer(containerId);
|
||||||
getContext().getNMStateStore().storeContainerCompleted(containerId,
|
try {
|
||||||
retCode);
|
getContext().getNMStateStore().storeContainerCompleted(containerId,
|
||||||
} catch (IOException e) {
|
retCode);
|
||||||
LOG.error("Unable to set exit code for container " + containerId);
|
} catch (IOException e) {
|
||||||
|
LOG.error("Unable to set exit code for container " + containerId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user