YARN-11567 - Aggregate container launch debug artifacts on error (#6053)
This commit is contained in:
parent
cc66683b1a
commit
0780710f25
@ -150,7 +150,11 @@ private static void addDeprecatedKeys() {
|
||||
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
|
||||
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
|
||||
|
||||
public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR =
|
||||
YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled";
|
||||
|
||||
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
|
||||
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false;
|
||||
|
||||
////////////////////////////////
|
||||
// IPC Configs
|
||||
|
@ -1656,6 +1656,21 @@
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Generate additional logs about container launches,
|
||||
if the container returned with a non-zero exit code.
|
||||
Currently, this creates a copy of the launch script and lists the
|
||||
directory contents of the container work dir. When listing directory
|
||||
contents, we follow symlinks to a max-depth of 5 (including symlinks
|
||||
which point to outside the container work dir), which may lead to
|
||||
slowness in launching containers.
|
||||
If yarn.nodemanager.log-container-debug-info.enabled is true,
|
||||
this property has no effect on the behavior.
|
||||
</description>
|
||||
<name>yarn.nodemanager.log-container-debug-info-on-error.enabled</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Amount of physical memory, in MB, that can be allocated
|
||||
for containers. If set to -1 and
|
||||
|
@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable {
|
||||
private String[] whitelistVars;
|
||||
private int exitCodeFileTimeout =
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
|
||||
private int containerExitCode;
|
||||
|
||||
@Override
|
||||
public void setConf(Configuration conf) {
|
||||
@ -303,7 +304,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
||||
|
||||
if (pidPath == null) {
|
||||
LOG.warn("{} is not active, returning terminated error", containerId);
|
||||
|
||||
containerExitCode = ExitCode.TERMINATED.getExitCode();
|
||||
return ExitCode.TERMINATED.getExitCode();
|
||||
}
|
||||
|
||||
@ -335,7 +336,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
||||
while (!file.exists() && msecLeft >= 0) {
|
||||
if (!isContainerActive(containerId)) {
|
||||
LOG.info("{} was deactivated", containerId);
|
||||
|
||||
containerExitCode = ExitCode.TERMINATED.getExitCode();
|
||||
return ExitCode.TERMINATED.getExitCode();
|
||||
}
|
||||
|
||||
@ -350,7 +351,9 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
||||
}
|
||||
|
||||
try {
|
||||
return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
|
||||
containerExitCode = Integer.parseInt(
|
||||
FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
|
||||
return containerExitCode;
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IOException("Error parsing exit code from pid " + pid, e);
|
||||
}
|
||||
@ -453,9 +456,7 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
|
||||
}
|
||||
|
||||
// dump debugging information if configured
|
||||
if (getConf() != null &&
|
||||
getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
|
||||
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
|
||||
if (shouldWriteDebugInformation(getConf())) {
|
||||
sb.echo("Copying debugging information");
|
||||
sb.copyDebugInformation(new Path(outFilename),
|
||||
new Path(logDir, outFilename));
|
||||
@ -488,6 +489,18 @@ protected File[] readDirAsUser(String user, Path dir) {
|
||||
return new File(dir.toString()).listFiles();
|
||||
}
|
||||
|
||||
private boolean shouldWriteDebugInformation(Configuration config) {
|
||||
return config != null && (
|
||||
config.getBoolean(
|
||||
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
|
||||
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO
|
||||
) || (
|
||||
config.getBoolean(
|
||||
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR,
|
||||
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR
|
||||
) && containerExitCode != 0));
|
||||
}
|
||||
|
||||
/**
|
||||
* The container exit code.
|
||||
*/
|
||||
|
@ -1844,6 +1844,63 @@ public void testDebuggingInformation() throws IOException {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDebuggingInformationOnError() throws IOException {
|
||||
File shellFile = null;
|
||||
File tempFile = null;
|
||||
Configuration conf = new YarnConfiguration();
|
||||
try {
|
||||
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
|
||||
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
|
||||
String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : "echo \"hello\"";
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
FileUtil.setExecutable(shellFile, true);
|
||||
writer.println(testCommand);
|
||||
writer.close();
|
||||
Map<Path, List<String>> resources = new HashMap<>();
|
||||
Map<String, String> env = new HashMap<>();
|
||||
List<String> commands = new ArrayList<>();
|
||||
if (Shell.WINDOWS) {
|
||||
commands.add("cmd");
|
||||
commands.add("/c");
|
||||
commands.add("\"" + shellFile.getAbsolutePath() + "\"");
|
||||
} else {
|
||||
commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\"");
|
||||
}
|
||||
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false);
|
||||
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true);
|
||||
FileOutputStream fos = new FileOutputStream(tempFile);
|
||||
ContainerExecutor exec = new DefaultContainerExecutor();
|
||||
exec.setConf(conf);
|
||||
LinkedHashSet<String> nmVars = new LinkedHashSet<>();
|
||||
exec.writeLaunchEnv(fos, env, resources, commands,
|
||||
new Path(localLogDir.getAbsolutePath()), "user",
|
||||
tempFile.getName(), nmVars);
|
||||
fos.flush();
|
||||
fos.close();
|
||||
FileUtil.setExecutable(tempFile, true);
|
||||
Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor(
|
||||
new String[]{tempFile.getAbsolutePath()}, tmpDir);
|
||||
shexc.execute();
|
||||
assertThat(shexc.getExitCode()).isZero();
|
||||
File directorInfo =
|
||||
new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS);
|
||||
File scriptCopy = new File(localLogDir, tempFile.getName());
|
||||
Assert.assertFalse("Directory info file missing",
|
||||
directorInfo.exists());
|
||||
Assert.assertFalse("Copy of launch script missing",
|
||||
scriptCopy.exists());
|
||||
} finally {
|
||||
// cleanup
|
||||
if (shellFile != null && shellFile.exists()) {
|
||||
shellFile.delete();
|
||||
}
|
||||
if (tempFile != null && tempFile.exists()) {
|
||||
tempFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test container launch fault.
|
||||
* @throws Exception
|
||||
|
Loading…
Reference in New Issue
Block a user