From 0780710f25a36f4471942edfe7a7f396cacb226d Mon Sep 17 00:00:00 2001 From: K0K0V0K <109747532+K0K0V0K@users.noreply.github.com> Date: Fri, 22 Sep 2023 15:09:17 +0200 Subject: [PATCH] YARN-11567 - Aggregate container launch debug artifacts on error (#6053) --- .../hadoop/yarn/conf/YarnConfiguration.java | 4 ++ .../src/main/resources/yarn-default.xml | 15 +++++ .../server/nodemanager/ContainerExecutor.java | 25 ++++++-- .../launcher/TestContainerLaunch.java | 57 +++++++++++++++++++ 4 files changed, 95 insertions(+), 6 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index ef06299fcf..bbb1ed6f8a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -150,7 +150,11 @@ private static void addDeprecatedKeys() { public static final String NM_LOG_CONTAINER_DEBUG_INFO = YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled"; + public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = + YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled"; + public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true; + public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false; //////////////////////////////// // IPC Configs diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 9697f7aa88..9fa600db4b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml 
@@ -1656,6 +1656,21 @@ true + + Generate additional logs about container launches + if the container returned a non-zero exit code. + Currently, this creates a copy of the launch script and lists the + directory contents of the container work dir. When listing directory + contents, we follow symlinks to a max-depth of 5 (including symlinks + which point outside the container work dir), which may slow + down container launches. + If yarn.nodemanager.log-container-debug-info.enabled is true, + this property has no effect. + + yarn.nodemanager.log-container-debug-info-on-error.enabled + false + + Amount of physical memory, in MB, that can be allocated for containers. If set to -1 and diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 65e8183f69..3d0dca622c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable { private String[] whitelistVars; private int exitCodeFileTimeout = YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT; + private int containerExitCode; @Override public void setConf(Configuration conf) { @@ -303,7 +304,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx) if (pidPath == null) { LOG.warn("{} is not active, returning terminated error", containerId); - + containerExitCode = ExitCode.TERMINATED.getExitCode(); return 
ExitCode.TERMINATED.getExitCode(); } @@ -335,7 +336,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx) while (!file.exists() && msecLeft >= 0) { if (!isContainerActive(containerId)) { LOG.info("{} was deactivated", containerId); - + containerExitCode = ExitCode.TERMINATED.getExitCode(); return ExitCode.TERMINATED.getExitCode(); } @@ -350,7 +351,9 @@ public int reacquireContainer(ContainerReacquisitionContext ctx) } try { - return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim()); + containerExitCode = Integer.parseInt( + FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim()); + return containerExitCode; } catch (NumberFormatException e) { throw new IOException("Error parsing exit code from pid " + pid, e); } @@ -453,9 +456,7 @@ public void writeLaunchEnv(OutputStream out, Map environment, } // dump debugging information if configured - if (getConf() != null && - getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, - YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) { + if (shouldWriteDebugInformation(getConf())) { sb.echo("Copying debugging information"); sb.copyDebugInformation(new Path(outFilename), new Path(logDir, outFilename)); @@ -488,6 +489,18 @@ protected File[] readDirAsUser(String user, Path dir) { return new File(dir.toString()).listFiles(); } + private boolean shouldWriteDebugInformation(Configuration config) { + return config != null && ( + config.getBoolean( + YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, + YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO + ) || ( + config.getBoolean( + YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, + YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR + ) && containerExitCode != 0)); + } + /** * The container exit code. 
*/ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java index bd135ff519..6971d34b9d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java @@ -1844,6 +1844,63 @@ public void testDebuggingInformation() throws IOException { } } + @Test + public void testDebuggingInformationOnError() throws IOException { + File shellFile = null; + File tempFile = null; + Configuration conf = new YarnConfiguration(); + try { + shellFile = Shell.appendScriptExtension(tmpDir, "hello"); + tempFile = Shell.appendScriptExtension(tmpDir, "temp"); + String testCommand = Shell.WINDOWS ? 
"@echo \"hello\"" : "echo \"hello\""; + PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile)); + FileUtil.setExecutable(shellFile, true); + writer.println(testCommand); + writer.close(); + Map> resources = new HashMap<>(); + Map env = new HashMap<>(); + List commands = new ArrayList<>(); + if (Shell.WINDOWS) { + commands.add("cmd"); + commands.add("/c"); + commands.add("\"" + shellFile.getAbsolutePath() + "\""); + } else { + commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\""); + } + conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false); + conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true); + FileOutputStream fos = new FileOutputStream(tempFile); + ContainerExecutor exec = new DefaultContainerExecutor(); + exec.setConf(conf); + LinkedHashSet nmVars = new LinkedHashSet<>(); + exec.writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath()), "user", + tempFile.getName(), nmVars); + fos.flush(); + fos.close(); + FileUtil.setExecutable(tempFile, true); + Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor( + new String[]{tempFile.getAbsolutePath()}, tmpDir); + shexc.execute(); + assertThat(shexc.getExitCode()).isZero(); + File directorInfo = + new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS); + File scriptCopy = new File(localLogDir, tempFile.getName()); + Assert.assertFalse("Directory info file missing", + directorInfo.exists()); + Assert.assertFalse("Copy of launch script missing", + scriptCopy.exists()); + } finally { + // cleanup + if (shellFile != null && shellFile.exists()) { + shellFile.delete(); + } + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + } + /** * Test container launch fault. * @throws Exception