YARN-814. Improving diagnostics when containers fail during launch due to various reasons like invalid env etc. Contributed by Jian He.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1504732 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2013-07-19 00:28:18 +00:00
parent 8724ceb235
commit 7a29bccd7a
6 changed files with 231 additions and 32 deletions

View File

@ -74,6 +74,9 @@ Release 2.1.1-beta - UNRELEASED
YARN-62. Modified NodeManagers to avoid AMs from abusing container tokens for YARN-62. Modified NodeManagers to avoid AMs from abusing container tokens for
repetitive container launches. (Omkar Vinit Joshi via vinodkv) repetitive container launches. (Omkar Vinit Joshi via vinodkv)
YARN-814. Improving diagnostics when containers fail during launch due to
various reasons like invalid env etc. (Jian He via vinodkv)
Release 2.1.0-beta - 2013-07-02 Release 2.1.0-beta - 2013-07-02
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -39,8 +39,10 @@
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader; import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
public abstract class ContainerExecutor implements Configurable { public abstract class ContainerExecutor implements Configurable {
@ -291,15 +293,16 @@ public String getProcessId(ContainerId containerID) {
} }
public static class DelayedProcessKiller extends Thread { public static class DelayedProcessKiller extends Thread {
private Container container;
private final String user; private final String user;
private final String pid; private final String pid;
private final long delay; private final long delay;
private final Signal signal; private final Signal signal;
private final ContainerExecutor containerExecutor; private final ContainerExecutor containerExecutor;
public DelayedProcessKiller(String user, String pid, long delay, public DelayedProcessKiller(Container container, String user, String pid,
Signal signal, long delay, Signal signal, ContainerExecutor containerExecutor) {
ContainerExecutor containerExecutor) { this.container = container;
this.user = user; this.user = user;
this.pid = pid; this.pid = pid;
this.delay = delay; this.delay = delay;
@ -316,7 +319,11 @@ public void run() {
} catch (InterruptedException e) { } catch (InterruptedException e) {
return; return;
} catch (IOException e) { } catch (IOException e) {
LOG.warn("Exception when killing task " + pid, e); String message = "Exception when user " + user + " killing task " + pid
+ " in DelayedProcessKiller: " + StringUtils.stringifyException(e);
LOG.warn(message);
container.handle(new ContainerDiagnosticsUpdateEvent(container
.getContainerId(), message));
} }
} }
} }

View File

@ -41,6 +41,7 @@
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
@ -203,11 +204,23 @@ public int launchContainer(Container container,
return -1; return -1;
} }
int exitCode = shExec.getExitCode(); int exitCode = shExec.getExitCode();
LOG.warn("Exit code from task is : " + exitCode); LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
String message = shExec.getOutput(); // 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
logOutput(message); // terminated/killed forcefully. In all other cases, log the
// container-executor's output
if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
&& exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch with container ID: "
+ containerId + " and exit code: " + exitCode , e);
logOutput(shExec.getOutput());
String diagnostics = "Exception from container-launch: \n"
+ StringUtils.stringifyException(e) + "\n" + shExec.getOutput();
container.handle(new ContainerDiagnosticsUpdateEvent(containerId, container.handle(new ContainerDiagnosticsUpdateEvent(containerId,
message)); diagnostics));
} else {
container.handle(new ContainerDiagnosticsUpdateEvent(containerId,
"Container killed on request. Exit code is " + exitCode));
}
return exitCode; return exitCode;
} finally { } finally {
; // ; //

View File

@ -146,7 +146,8 @@ public void init() throws IOException {
shExec.execute(); shExec.execute();
} catch (ExitCodeException e) { } catch (ExitCodeException e) {
int exitCode = shExec.getExitCode(); int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode); LOG.warn("Exit code from container executor initialization is : "
+ exitCode, e);
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
throw new IOException("Linux container executor not configured properly" throw new IOException("Linux container executor not configured properly"
+ " (error=" + exitCode + ")", e); + " (error=" + exitCode + ")", e);
@ -203,10 +204,11 @@ public void startLocalizer(Path nmPrivateContainerTokensPath,
} }
} catch (ExitCodeException e) { } catch (ExitCodeException e) {
int exitCode = shExec.getExitCode(); int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode); LOG.warn("Exit code from container " + locId + " startLocalizer is : "
+ exitCode, e);
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
throw new IOException("App initialization failed (" + exitCode + throw new IOException("Application " + appId + " initialization failed" +
") with output: " + shExec.getOutput(), e); " (exitCode=" + exitCode + ") with output: " + shExec.getOutput(), e);
} }
} }
@ -255,19 +257,18 @@ public int launchContainer(Container container,
return ExitCode.TERMINATED.getExitCode(); return ExitCode.TERMINATED.getExitCode();
} }
} catch (ExitCodeException e) { } catch (ExitCodeException e) {
if (null == shExec) { if (null == shExec) {
return -1; return -1;
} }
int exitCode = shExec.getExitCode(); int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode); LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
// 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was // 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
// terminated/killed forcefully. In all other cases, log the // terminated/killed forcefully. In all other cases, log the
// container-executor's output // container-executor's output
if (exitCode != ExitCode.FORCE_KILLED.getExitCode() if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
&& exitCode != ExitCode.TERMINATED.getExitCode()) { && exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch : ", e); LOG.warn("Exception from container-launch with container ID: "
+ containerId + " and exit code: " + exitCode , e);
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
String diagnostics = "Exception from container-launch: \n" String diagnostics = "Exception from container-launch: \n"
+ StringUtils.stringifyException(e) + "\n" + shExec.getOutput(); + StringUtils.stringifyException(e) + "\n" + shExec.getOutput();
@ -309,9 +310,12 @@ public boolean signalContainer(String user, String pid, Signal signal)
if (ret_code == ResultCode.INVALID_CONTAINER_PID.getValue()) { if (ret_code == ResultCode.INVALID_CONTAINER_PID.getValue()) {
return false; return false;
} }
LOG.warn("Error in signalling container " + pid + " with " + signal
+ "; exit = " + ret_code, e);
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
throw new IOException("Problem signalling container " + pid + " with " + throw new IOException("Problem signalling container " + pid + " with "
signal + "; exit = " + ret_code); + signal + "; output: " + shExec.getOutput() + " and exitCode: "
+ ret_code, e);
} }
return true; return true;
} }
@ -345,15 +349,12 @@ public void deleteAsUser(String user, Path dir, Path... baseDirs) {
} }
} catch (IOException e) { } catch (IOException e) {
int exitCode = shExec.getExitCode(); int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode);
if (exitCode != 0) {
LOG.error("DeleteAsUser for " + dir.toUri().getPath() LOG.error("DeleteAsUser for " + dir.toUri().getPath()
+ " returned with non-zero exit code" + exitCode); + " returned with exit code: " + exitCode, e);
LOG.error("Output from LinuxContainerExecutor's deleteAsUser follows:"); LOG.error("Output from LinuxContainerExecutor's deleteAsUser follows:");
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
} }
} }
}
public void mountCgroups(List<String> cgroupKVs, String hierarchy) public void mountCgroups(List<String> cgroupKVs, String hierarchy)
throws IOException { throws IOException {
@ -372,9 +373,10 @@ public void mountCgroups(List<String> cgroupKVs, String hierarchy)
shExec.execute(); shExec.execute();
} catch (IOException e) { } catch (IOException e) {
int ret_code = shExec.getExitCode(); int ret_code = shExec.getExitCode();
LOG.warn("Exception in LinuxContainerExecutor mountCgroups ", e);
logOutput(shExec.getOutput()); logOutput(shExec.getOutput());
throw new IOException("Problem mounting cgroups " + cgroupKVs + throw new IOException("Problem mounting cgroups " + cgroupKVs +
"; exit code = " + ret_code, e); "; exit code = " + ret_code + " and output: " + shExec.getOutput(), e);
} }
} }
} }

View File

@ -61,6 +61,7 @@
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
@ -308,6 +309,7 @@ public Integer call() {
* the process id is available. * the process id is available.
* @throws IOException * @throws IOException
*/ */
@SuppressWarnings("unchecked") // dispatcher not typed
public void cleanupContainer() throws IOException { public void cleanupContainer() throws IOException {
ContainerId containerId = container.getContainerId(); ContainerId containerId = container.getContainerId();
String containerIdStr = ConverterUtils.toString(containerId); String containerIdStr = ConverterUtils.toString(containerId);
@ -355,13 +357,17 @@ public void cleanupContainer() throws IOException {
+ " as user " + user + " as user " + user
+ " for container " + containerIdStr + " for container " + containerIdStr
+ ", result=" + (result? "success" : "failed")); + ", result=" + (result? "success" : "failed"));
new DelayedProcessKiller(user, new DelayedProcessKiller(container, user,
processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start(); processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
} }
} }
} catch (Exception e) { } catch (Exception e) {
LOG.warn("Got error when trying to cleanup container " + containerIdStr String message =
+ ", error=" + e.getMessage()); "Exception when trying to cleanup container " + containerIdStr
+ ": " + StringUtils.stringifyException(e);
LOG.warn(message);
dispatcher.getEventHandler().handle(
new ContainerDiagnosticsUpdateEvent(containerId, message));
} finally { } finally {
// cleanup pid file if present // cleanup pid file if present
if (pidFilePath != null) { if (pidFilePath != null) {

View File

@ -39,6 +39,7 @@
import org.apache.hadoop.fs.UnsupportedFileSystemException; import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.security.token.SecretManager.InvalidToken; import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest; import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
@ -151,6 +152,173 @@ public void testSpecialCharSymlinks() throws IOException {
} }
} }
// test the diagnostics are generated
@Test (timeout = 20000)
public void testInvalidSymlinkDiagnostics() throws IOException {
File shellFile = null;
File tempFile = null;
String symLink = Shell.WINDOWS ? "test.cmd" :
"test";
File symLinkFile = null;
try {
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
String timeoutCommand = Shell.WINDOWS ? "@echo \"hello\"" :
"echo \"hello\"";
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
FileUtil.setExecutable(shellFile, true);
writer.println(timeoutCommand);
writer.close();
Map<Path, List<String>> resources =
new HashMap<Path, List<String>>();
//This is an invalid path and should throw exception because of No such file.
Path invalidPath = new Path(shellFile.getAbsolutePath()+"randomPath");
resources.put(invalidPath, Arrays.asList(symLink));
FileOutputStream fos = new FileOutputStream(tempFile);
Map<String, String> env = new HashMap<String, String>();
List<String> commands = new ArrayList<String>();
if (Shell.WINDOWS) {
commands.add("cmd");
commands.add("/c");
commands.add("\"" + symLink + "\"");
} else {
commands.add("/bin/sh ./\\\"" + symLink + "\\\"");
}
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
fos.flush();
fos.close();
FileUtil.setExecutable(tempFile, true);
Shell.ShellCommandExecutor shexc
= new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, tmpDir);
String diagnostics = null;
try {
shexc.execute();
Assert.fail("Should catch exception");
} catch(ExitCodeException e){
diagnostics = e.getMessage();
}
Assert.assertNotNull(diagnostics);
Assert.assertTrue(shexc.getExitCode() != 0);
symLinkFile = new File(tmpDir, symLink);
}
finally {
// cleanup
if (shellFile != null
&& shellFile.exists()) {
shellFile.delete();
}
if (tempFile != null
&& tempFile.exists()) {
tempFile.delete();
}
if (symLinkFile != null
&& symLinkFile.exists()) {
symLinkFile.delete();
}
}
}
@Test (timeout = 20000)
public void testInvalidEnvSyntaxDiagnostics() throws IOException {
File shellFile = null;
try {
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
String timeoutCommand = Shell.WINDOWS ? "@echo \"hello\"" :
"echo \"hello\"";
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
FileUtil.setExecutable(shellFile, true);
writer.println(timeoutCommand);
writer.close();
Map<Path, List<String>> resources =
new HashMap<Path, List<String>>();
FileOutputStream fos = new FileOutputStream(shellFile);
Map<String, String> env = new HashMap<String, String>();
// invalid env
env.put(
"APPLICATION_WORKFLOW_CONTEXT", "{\"workflowId\":\"609f91c5cd83\"," +
"\"workflowName\":\"\n\ninsert table " +
"\npartition (cd_education_status)\nselect cd_demo_sk, cd_gender, " );
List<String> commands = new ArrayList<String>();
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
fos.flush();
fos.close();
Shell.ShellCommandExecutor shexc
= new Shell.ShellCommandExecutor(new String[]{shellFile.getAbsolutePath()}, tmpDir);
String diagnostics = null;
try {
shexc.execute();
Assert.fail("Should catch exception");
} catch(ExitCodeException e){
diagnostics = e.getMessage();
}
Assert.assertTrue(diagnostics.contains("command not found"));
Assert.assertTrue(shexc.getExitCode() != 0);
}
finally {
// cleanup
if (shellFile != null
&& shellFile.exists()) {
shellFile.delete();
}
}
}
@Test (timeout = 20000)
public void testContainerLaunchStdoutAndStderrDiagnostics() throws IOException {
File shellFile = null;
try {
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
// echo "hello" to stdout and "error" to stderr and exit code with 2;
String command = Shell.WINDOWS ? "@echo \"hello\"; @echo \"error\" 1>&2; exit 2;" :
"echo \"hello\"; echo \"error\" 1>&2; exit 2;";
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
FileUtil.setExecutable(shellFile, true);
writer.println(command);
writer.close();
Map<Path, List<String>> resources =
new HashMap<Path, List<String>>();
FileOutputStream fos = new FileOutputStream(shellFile);
Map<String, String> env = new HashMap<String, String>();
List<String> commands = new ArrayList<String>();
commands.add(command);
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
fos.flush();
fos.close();
Shell.ShellCommandExecutor shexc
= new Shell.ShellCommandExecutor(new String[]{shellFile.getAbsolutePath()}, tmpDir);
String diagnostics = null;
try {
shexc.execute();
Assert.fail("Should catch exception");
} catch(ExitCodeException e){
diagnostics = e.getMessage();
}
// test stderr
Assert.assertTrue(diagnostics.contains("error"));
// test stdout
Assert.assertTrue(shexc.getOutput().contains("hello"));
Assert.assertTrue(shexc.getExitCode() == 2);
}
finally {
// cleanup
if (shellFile != null
&& shellFile.exists()) {
shellFile.delete();
}
}
}
/** /**
* See if environment variable is forwarded using sanitizeEnv. * See if environment variable is forwarded using sanitizeEnv.
* @throws Exception * @throws Exception