YARN-11567 - Aggregate container launch debug artifacts on error (#6053)
This commit is contained in:
parent
cc66683b1a
commit
0780710f25
@ -150,7 +150,11 @@ private static void addDeprecatedKeys() {
|
|||||||
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
|
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
|
||||||
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
|
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
|
||||||
|
|
||||||
|
public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR =
|
||||||
|
YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled";
|
||||||
|
|
||||||
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
|
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
|
||||||
|
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false;
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// IPC Configs
|
// IPC Configs
|
||||||
|
@ -1656,6 +1656,21 @@
|
|||||||
<value>true</value>
|
<value>true</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Generate additional logs about container launches,
|
||||||
|
if the container returned with a non-zero exit code.
|
||||||
|
Currently, this creates a copy of the launch script and lists the
|
||||||
|
directory contents of the container work dir. When listing directory
|
||||||
|
contents, we follow symlinks to a max-depth of 5 (including symlinks
|
||||||
|
which point outside the container work dir), which may lead to
|
||||||
|
slowness in launching containers.
|
||||||
|
If yarn.nodemanager.log-container-debug-info.enabled is true,
|
||||||
|
this property has no effect on the behavior.
|
||||||
|
</description>
|
||||||
|
<name>yarn.nodemanager.log-container-debug-info-on-error.enabled</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>Amount of physical memory, in MB, that can be allocated
|
<description>Amount of physical memory, in MB, that can be allocated
|
||||||
for containers. If set to -1 and
|
for containers. If set to -1 and
|
||||||
|
@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable {
|
|||||||
private String[] whitelistVars;
|
private String[] whitelistVars;
|
||||||
private int exitCodeFileTimeout =
|
private int exitCodeFileTimeout =
|
||||||
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
|
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
|
||||||
|
private int containerExitCode;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setConf(Configuration conf) {
|
public void setConf(Configuration conf) {
|
||||||
@ -303,7 +304,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
|||||||
|
|
||||||
if (pidPath == null) {
|
if (pidPath == null) {
|
||||||
LOG.warn("{} is not active, returning terminated error", containerId);
|
LOG.warn("{} is not active, returning terminated error", containerId);
|
||||||
|
containerExitCode = ExitCode.TERMINATED.getExitCode();
|
||||||
return ExitCode.TERMINATED.getExitCode();
|
return ExitCode.TERMINATED.getExitCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -335,7 +336,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
|||||||
while (!file.exists() && msecLeft >= 0) {
|
while (!file.exists() && msecLeft >= 0) {
|
||||||
if (!isContainerActive(containerId)) {
|
if (!isContainerActive(containerId)) {
|
||||||
LOG.info("{} was deactivated", containerId);
|
LOG.info("{} was deactivated", containerId);
|
||||||
|
containerExitCode = ExitCode.TERMINATED.getExitCode();
|
||||||
return ExitCode.TERMINATED.getExitCode();
|
return ExitCode.TERMINATED.getExitCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -350,7 +351,9 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
|
containerExitCode = Integer.parseInt(
|
||||||
|
FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
|
||||||
|
return containerExitCode;
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
throw new IOException("Error parsing exit code from pid " + pid, e);
|
throw new IOException("Error parsing exit code from pid " + pid, e);
|
||||||
}
|
}
|
||||||
@ -453,9 +456,7 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// dump debugging information if configured
|
// dump debugging information if configured
|
||||||
if (getConf() != null &&
|
if (shouldWriteDebugInformation(getConf())) {
|
||||||
getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
|
|
||||||
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
|
|
||||||
sb.echo("Copying debugging information");
|
sb.echo("Copying debugging information");
|
||||||
sb.copyDebugInformation(new Path(outFilename),
|
sb.copyDebugInformation(new Path(outFilename),
|
||||||
new Path(logDir, outFilename));
|
new Path(logDir, outFilename));
|
||||||
@ -488,6 +489,18 @@ protected File[] readDirAsUser(String user, Path dir) {
|
|||||||
return new File(dir.toString()).listFiles();
|
return new File(dir.toString()).listFiles();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean shouldWriteDebugInformation(Configuration config) {
|
||||||
|
return config != null && (
|
||||||
|
config.getBoolean(
|
||||||
|
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
|
||||||
|
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO
|
||||||
|
) || (
|
||||||
|
config.getBoolean(
|
||||||
|
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR,
|
||||||
|
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR
|
||||||
|
) && containerExitCode != 0));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The container exit code.
|
* The container exit code.
|
||||||
*/
|
*/
|
||||||
|
@ -1844,6 +1844,63 @@ public void testDebuggingInformation() throws IOException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDebuggingInformationOnError() throws IOException {
|
||||||
|
File shellFile = null;
|
||||||
|
File tempFile = null;
|
||||||
|
Configuration conf = new YarnConfiguration();
|
||||||
|
try {
|
||||||
|
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
|
||||||
|
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
|
||||||
|
String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : "echo \"hello\"";
|
||||||
|
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||||
|
FileUtil.setExecutable(shellFile, true);
|
||||||
|
writer.println(testCommand);
|
||||||
|
writer.close();
|
||||||
|
Map<Path, List<String>> resources = new HashMap<>();
|
||||||
|
Map<String, String> env = new HashMap<>();
|
||||||
|
List<String> commands = new ArrayList<>();
|
||||||
|
if (Shell.WINDOWS) {
|
||||||
|
commands.add("cmd");
|
||||||
|
commands.add("/c");
|
||||||
|
commands.add("\"" + shellFile.getAbsolutePath() + "\"");
|
||||||
|
} else {
|
||||||
|
commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\"");
|
||||||
|
}
|
||||||
|
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false);
|
||||||
|
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true);
|
||||||
|
FileOutputStream fos = new FileOutputStream(tempFile);
|
||||||
|
ContainerExecutor exec = new DefaultContainerExecutor();
|
||||||
|
exec.setConf(conf);
|
||||||
|
LinkedHashSet<String> nmVars = new LinkedHashSet<>();
|
||||||
|
exec.writeLaunchEnv(fos, env, resources, commands,
|
||||||
|
new Path(localLogDir.getAbsolutePath()), "user",
|
||||||
|
tempFile.getName(), nmVars);
|
||||||
|
fos.flush();
|
||||||
|
fos.close();
|
||||||
|
FileUtil.setExecutable(tempFile, true);
|
||||||
|
Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor(
|
||||||
|
new String[]{tempFile.getAbsolutePath()}, tmpDir);
|
||||||
|
shexc.execute();
|
||||||
|
assertThat(shexc.getExitCode()).isZero();
|
||||||
|
File directorInfo =
|
||||||
|
new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS);
|
||||||
|
File scriptCopy = new File(localLogDir, tempFile.getName());
|
||||||
|
Assert.assertFalse("Directory info file missing",
|
||||||
|
directorInfo.exists());
|
||||||
|
Assert.assertFalse("Copy of launch script missing",
|
||||||
|
scriptCopy.exists());
|
||||||
|
} finally {
|
||||||
|
// cleanup
|
||||||
|
if (shellFile != null && shellFile.exists()) {
|
||||||
|
shellFile.delete();
|
||||||
|
}
|
||||||
|
if (tempFile != null && tempFile.exists()) {
|
||||||
|
tempFile.delete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test container launch fault.
|
* Test container launch fault.
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
|
Loading…
Reference in New Issue
Block a user