YARN-11567 - Aggregate container launch debug artifacts on error (#6053)

This commit is contained in:
K0K0V0K 2023-09-22 15:09:17 +02:00 committed by GitHub
parent cc66683b1a
commit 0780710f25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 6 deletions

View File

@ -150,7 +150,11 @@ private static void addDeprecatedKeys() {
/**
 * Configuration key that switches on the generation of container launch
 * debugging information (the key ends in "log-container-debug-info.enabled").
 */
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
/**
 * Configuration key that switches on the generation of container launch
 * debugging information only when a non-zero container exit code has been
 * recorded. It is only consulted when {@link #NM_LOG_CONTAINER_DEBUG_INFO}
 * evaluates to false.
 */
public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR =
YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled";
/** Default for {@link #NM_LOG_CONTAINER_DEBUG_INFO}: enabled. */
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
/** Default for {@link #NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR}: disabled. */
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false;
////////////////////////////////
// IPC Configs

View File

@ -1656,6 +1656,21 @@
<value>true</value>
</property>
<property>
<description>Generate additional logs about container launches
if the container exits with a non-zero exit code.
Currently, this creates a copy of the launch script and lists the
directory contents of the container work dir. When listing directory
contents, we follow symlinks to a max-depth of 5 (including symlinks
which point outside the container work dir), which may slow down
container launches.
If yarn.nodemanager.log-container-debug-info.enabled is true,
this property has no effect.
</description>
<name>yarn.nodemanager.log-container-debug-info-on-error.enabled</name>
<value>false</value>
</property>
<property>
<description>Amount of physical memory, in MB, that can be allocated
for containers. If set to -1 and

View File

@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable {
private String[] whitelistVars;
// Starts at the YarnConfiguration default for the exit-file timeout.
private int exitCodeFileTimeout =
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
// Exit code most recently recorded by reacquireContainer(); 0 until one is
// recorded. Read when deciding whether on-error debug information is written.
// NOTE(review): this is a single field on the executor instance, so it is not
// tracked per container -- confirm that is intended.
private int containerExitCode;
@Override
public void setConf(Configuration conf) {
@ -303,7 +304,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
if (pidPath == null) {
LOG.warn("{} is not active, returning terminated error", containerId);
containerExitCode = ExitCode.TERMINATED.getExitCode();
return ExitCode.TERMINATED.getExitCode();
}
@ -335,7 +336,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
while (!file.exists() && msecLeft >= 0) {
if (!isContainerActive(containerId)) {
LOG.info("{} was deactivated", containerId);
containerExitCode = ExitCode.TERMINATED.getExitCode();
return ExitCode.TERMINATED.getExitCode();
}
@ -350,7 +351,9 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
}
try {
return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
containerExitCode = Integer.parseInt(
FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
return containerExitCode;
} catch (NumberFormatException e) {
throw new IOException("Error parsing exit code from pid " + pid, e);
}
@ -453,9 +456,7 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
}
// dump debugging information if configured
if (getConf() != null &&
getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
if (shouldWriteDebugInformation(getConf())) {
sb.echo("Copying debugging information");
sb.copyDebugInformation(new Path(outFilename),
new Path(logDir, outFilename));
@ -488,6 +489,18 @@ protected File[] readDirAsUser(String user, Path dir) {
return new File(dir.toString()).listFiles();
}
/**
 * Decides whether the launch script should include the steps that copy
 * debugging information.
 *
 * @param config the executor configuration; may be null
 * @return true when the configuration is present and either debug
 *         information is enabled unconditionally, or it is enabled on error
 *         and the recorded container exit code is non-zero
 */
private boolean shouldWriteDebugInformation(Configuration config) {
  if (config == null) {
    return false;
  }
  // The unconditional switch wins; the on-error switch is not consulted then.
  if (config.getBoolean(
      YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
      YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
    return true;
  }
  boolean enabledOnError = config.getBoolean(
      YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR,
      YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR);
  return enabledOnError && containerExitCode != 0;
}
/**
* The container exit code.
*/

View File

@ -1844,6 +1844,63 @@ public void testDebuggingInformation() throws IOException {
}
}
/**
 * Checks the on-error debug switch on the non-error path: with
 * NM_LOG_CONTAINER_DEBUG_INFO disabled and
 * NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR enabled, a launch script written by a
 * fresh executor (no non-zero exit code recorded) must run successfully and
 * must not produce the directory listing or the copy of the launch script.
 *
 * @throws IOException if a script file cannot be written or executed
 */
@Test
public void testDebuggingInformationOnError() throws IOException {
  File shellFile = null;
  File tempFile = null;
  Configuration conf = new YarnConfiguration();
  try {
    shellFile = Shell.appendScriptExtension(tmpDir, "hello");
    tempFile = Shell.appendScriptExtension(tmpDir, "temp");
    String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : "echo \"hello\"";
    // try-with-resources so the stream is released even if the write fails.
    try (PrintWriter writer =
        new PrintWriter(new FileOutputStream(shellFile))) {
      writer.println(testCommand);
    }
    FileUtil.setExecutable(shellFile, true);

    Map<Path, List<String>> resources = new HashMap<>();
    Map<String, String> env = new HashMap<>();
    List<String> commands = new ArrayList<>();
    if (Shell.WINDOWS) {
      commands.add("cmd");
      commands.add("/c");
      commands.add("\"" + shellFile.getAbsolutePath() + "\"");
    } else {
      commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\"");
    }

    // Only the on-error switch is active.
    conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false);
    conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true);

    ContainerExecutor exec = new DefaultContainerExecutor();
    exec.setConf(conf);
    LinkedHashSet<String> nmVars = new LinkedHashSet<>();
    try (FileOutputStream fos = new FileOutputStream(tempFile)) {
      exec.writeLaunchEnv(fos, env, resources, commands,
          new Path(localLogDir.getAbsolutePath()), "user",
          tempFile.getName(), nmVars);
      fos.flush();
    }
    FileUtil.setExecutable(tempFile, true);

    Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor(
        new String[]{tempFile.getAbsolutePath()}, tmpDir);
    shexc.execute();
    assertThat(shexc.getExitCode()).isZero();

    File directoryInfo =
        new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS);
    File scriptCopy = new File(localLogDir, tempFile.getName());
    // The messages describe the failure: the files exist when they must not.
    Assert.assertFalse("Directory info file should not have been created",
        directoryInfo.exists());
    Assert.assertFalse("Copy of launch script should not have been created",
        scriptCopy.exists());
  } finally {
    // cleanup
    if (shellFile != null && shellFile.exists()) {
      shellFile.delete();
    }
    if (tempFile != null && tempFile.exists()) {
      tempFile.delete();
    }
  }
}
/**
* Test container launch fault.
* @throws Exception