From c56e05196190f172e9c8cdcd9d59d09950f1419b Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Tue, 15 Nov 2011 10:30:26 +0000 Subject: [PATCH 1/4] MAPREDUCE-3102. Changed NodeManager to fail fast when LinuxContainerExecutor has wrong configuration or permissions. Contributed by Hitesh Shah. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1202117 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 + .../server/nodemanager/ContainerExecutor.java | 7 +++ .../nodemanager/DefaultContainerExecutor.java | 5 ++ .../nodemanager/LinuxContainerExecutor.java | 23 ++++++++ .../yarn/server/nodemanager/NodeManager.java | 5 ++ .../launcher/ContainerLaunch.java | 10 ++-- .../native/container-executor/impl/main.c | 38 +++++++++++-- .../server/nodemanager/TestNodeManager.java | 55 +++++++++++++++++++ 8 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index d2aba338d9..74f25008f6 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -91,6 +91,9 @@ Release 0.23.1 - Unreleased MAPREDUCE-3331. Improvement to single node cluster setup documentation for 0.23 (Anupam Seth via mahadev) + MAPREDUCE-3102. Changed NodeManager to fail fast when LinuxContainerExecutor + has wrong configuration or permissions. (Hitesh Shah via vinodkv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 3122592209..6c3667ae5f 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -62,6 +62,13 @@ public Configuration getConf() { return conf; } + /** + * Run the executor initialization steps. + * Verify that the necessary configs, permissions are in place. + * @throws IOException + */ + public abstract void init() throws IOException; + /** * Prepare the environment for containers in this application to execute. * For $x in local.dirs diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java index f3a3a224fe..9c252b142d 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java @@ -69,6 +69,11 @@ public DefaultContainerExecutor() { this.lfs = lfs; } + @Override + public void init() throws IOException { + // nothing to do or verify here + } + @Override public void startLocalizer(Path nmPrivateContainerTokensPath, InetSocketAddress nmAddr, String user, String appId, String locId, diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index a3cb8d77ab..2ecf2b302e 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -100,6 +100,29 @@ protected String getContainerExecutorExecutablePath(Configuration conf) { : conf.get(YarnConfiguration.NM_LINUX_CONTAINER_EXECUTOR_PATH, defaultPath); } + @Override + public void init() throws IOException { + // Send command to executor which will just start up, + // verify configuration/permissions and exit + List command = new ArrayList( + Arrays.asList(containerExecutorExe, + "--checksetup")); + String[] commandArray = command.toArray(new String[command.size()]); + ShellCommandExecutor shExec = new ShellCommandExecutor(commandArray); + if (LOG.isDebugEnabled()) { + LOG.debug("checkLinuxExecutorSetup: " + Arrays.toString(commandArray)); + } + try { + shExec.execute(); + } catch (ExitCodeException e) { + int exitCode = shExec.getExitCode(); + LOG.warn("Exit code from container is : " + exitCode); + logOutput(shExec.getOutput()); + throw new IOException("Linux container executor not configured properly" + + " (error=" + exitCode + ")", e); + } + } + @Override public void startLocalizer(Path nmPrivateContainerTokensPath, InetSocketAddress nmAddr, String user, String appId, String locId, diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 319d5a04c8..94971d365e 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -110,6 +110,11 @@ public void init(Configuration conf) { ContainerExecutor exec = ReflectionUtils.newInstance( conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class, ContainerExecutor.class), conf); + try { + exec.init(); + } catch (IOException e) { + throw new YarnException("Failed to initialize container executor", e); + } DeletionService del = new DeletionService(exec); addService(del); diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index 37a7966d15..f7fd522f81 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -302,10 +302,12 @@ public void cleanupContainer() throws IOException { // by this time, it will never be launched exec.deactivateContainer(containerId); - LOG.debug("Getting pid for container " + containerIdStr + " to kill" - + " from pid file " - + (pidFilePath != null ? pidFilePath.toString() : "null")); - + if (LOG.isDebugEnabled()) { + LOG.debug("Getting pid for container " + containerIdStr + " to kill" + + " from pid file " + + (pidFilePath != null ? pidFilePath.toString() : "null")); + } + // however the container process may have already started try { diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c index 6e62ef9100..40fbad8365 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c @@ -38,12 +38,15 @@ #endif void display_usage(FILE *stream) { + fprintf(stream, + "Usage: container-executor --checksetup\n"); fprintf(stream, "Usage: container-executor user command command-args\n"); fprintf(stream, "Commands:\n"); fprintf(stream, " initialize container: %2d appid tokens cmd app...\n", INITIALIZE_CONTAINER); - fprintf(stream, " launch container: %2d appid containerid workdir container-script tokens\n", + fprintf(stream, + " launch container: %2d appid containerid workdir container-script tokens pidfile\n", LAUNCH_CONTAINER); fprintf(stream, " signal container: %2d container-pid signal\n", SIGNAL_CONTAINER); @@ -52,14 +55,31 @@ void display_usage(FILE *stream) { } int main(int argc, char **argv) { - //Minimum number of arguments required to run the container-executor + int invalid_args = 0; + int do_check_setup = 0; + + LOGFILE = stdout; + ERRORFILE = stderr; + + // Minimum number of arguments required to run + // the std. container-executor commands is 4 + // 4 args not needed for checksetup option if (argc < 4) { + invalid_args = 1; + if (argc == 2) { + const char *arg1 = argv[1]; + if (strcmp("--checksetup", arg1) == 0) { + invalid_args = 0; + do_check_setup = 1; + } + } + } + + if (invalid_args != 0) { display_usage(stdout); return INVALID_ARGUMENT_NUMBER; } - LOGFILE = stdout; - ERRORFILE = stderr; int command; const char * app_id = NULL; const char * container_id = NULL; @@ -111,11 +131,19 @@ int main(int argc, char **argv) { return INVALID_CONTAINER_EXEC_PERMISSIONS; } + if (do_check_setup != 0) { + // basic setup checks done + // verified configs available and valid + // verified executor permissions + return 0; + } + //checks done for user name if (argv[optind] == NULL) { fprintf(ERRORFILE, "Invalid user name.\n"); return INVALID_USER_NAME; } + int ret = set_user(argv[optind]); if (ret != 0) { return ret; @@ -143,7 +171,7 @@ int main(int argc, char **argv) { break; case LAUNCH_CONTAINER: if (argc < 9) { - fprintf(ERRORFILE, "Too few arguments (%d vs 8) for launch container\n", + fprintf(ERRORFILE, "Too few arguments (%d vs 9) for launch container\n", argc); fflush(ERRORFILE); return INVALID_ARGUMENT_NUMBER; diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java new file mode 100644 index 0000000000..98fabe1c5d --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java @@ -0,0 +1,55 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.hadoop.yarn.server.nodemanager; + +import static org.junit.Assert.fail; + +import java.io.IOException; + +import org.apache.hadoop.yarn.YarnException; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.junit.Test; + +public class TestNodeManager { + + public static final class InvalidContainerExecutor extends + DefaultContainerExecutor { + @Override + public void init() throws IOException { + throw new IOException("dummy executor init called"); + } + } + + @Test + public void testContainerExecutorInitCall() { + NodeManager nm = new NodeManager(); + YarnConfiguration conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, + InvalidContainerExecutor.class, + ContainerExecutor.class); + try { + nm.init(conf); + fail("Init should fail"); + } catch (YarnException e) { + //PASS + assert(e.getCause().getMessage().contains("dummy executor init called")); + } + } + +} From 649080131c02660bcfc0d25485a4baececb80e0b Mon Sep 17 00:00:00 2001 From: Mahadev Konar Date: Wed, 16 Nov 2011 06:29:09 +0000 Subject: [PATCH 2/4] HADOOP-7811. TestUserGroupInformation#testGetServerSideGroups test fails in chroot. (Jonathan Eagles via mahadev) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1202540 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-common-project/hadoop-common/CHANGES.txt | 3 +++ .../org/apache/hadoop/security/TestUserGroupInformation.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 50d2673d28..77fa3c68b6 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -113,6 +113,9 @@ Release 0.23.1 - Unreleased OPTIMIZATIONS BUG FIXES + + HADOOP-7811. TestUserGroupInformation#testGetServerSideGroups test fails in chroot. + (Jonathan Eagles via mahadev) Release 0.23.0 - 2011-11-01 diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java index 76d64d91fd..fec1d9194d 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java @@ -99,7 +99,7 @@ public void testGetServerSideGroups() throws IOException, (new InputStreamReader(pp.getInputStream())); String userName = br.readLine().trim(); // get the groups - pp = Runtime.getRuntime().exec("id -Gn"); + pp = Runtime.getRuntime().exec("id -Gn " + userName); br = new BufferedReader(new InputStreamReader(pp.getInputStream())); String line = br.readLine(); System.out.println(userName + ":" + line); From 00b50a5c94df63668b07ca1623c40fe7252f1322 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Wed, 16 Nov 2011 15:37:27 +0000 Subject: [PATCH 3/4] MAPREDUCE-3355. Fixed MR AM's ContainerLauncher to handle node-command timeouts correctly. (vinodkv) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1202744 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 + .../app/launcher/ContainerLauncherImpl.java | 90 ++++++++++++++----- .../v2/app/TestContainerLauncher.java | 23 +++-- 3 files changed, 86 insertions(+), 30 deletions(-) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 74f25008f6..53e2efaba8 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -94,6 +94,9 @@ Release 0.23.1 - Unreleased MAPREDUCE-3102. Changed NodeManager to fail fast when LinuxContainerExecutor has wrong configuration or permissions. (Hitesh Shah via vinodkv) + MAPREDUCE-3355. Fixed MR AM's ContainerLauncher to handle node-command + timeouts correctly. (vinodkv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/launcher/ContainerLauncherImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/launcher/ContainerLauncherImpl.java index 62ceae99f9..f1670039e6 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/launcher/ContainerLauncherImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/launcher/ContainerLauncherImpl.java @@ -82,6 +82,7 @@ public class ContainerLauncherImpl extends AbstractService implements private Thread eventHandlingThread; private BlockingQueue eventQueue = new LinkedBlockingQueue(); + final Timer commandTimer = new Timer(true); YarnRPC rpc; // To track numNodes. @@ -201,14 +202,14 @@ public ContainerManager run() { return proxy; } - private static class CommandTimer extends TimerTask { + private static class CommandTimerTask extends TimerTask { private final Thread commandThread; - protected final ContainerLauncherEvent event; protected final String message; + private boolean cancelled = false; - public CommandTimer(Thread thread, ContainerLauncherEvent event) { + public CommandTimerTask(Thread thread, ContainerLauncherEvent event) { + super(); this.commandThread = thread; - this.event = event; this.message = "Couldn't complete " + event.getType() + " on " + event.getContainerID() + "/" + event.getTaskAttemptID() + ". Interrupting and returning"; @@ -216,8 +217,27 @@ public CommandTimer(Thread thread, ContainerLauncherEvent event) { @Override public void run() { - LOG.warn(this.message); - this.commandThread.interrupt(); + synchronized (this) { + if (this.cancelled) { + return; + } + LOG.warn(this.message); + StackTraceElement[] trace = this.commandThread.getStackTrace(); + StringBuilder logMsg = new StringBuilder(); + for (int i = 0; i < trace.length; i++) { + logMsg.append("\n\tat " + trace[i]); + } + LOG.info("Stack trace of the command-thread: \n" + logMsg.toString()); + this.commandThread.interrupt(); + } + } + + @Override + public boolean cancel() { + synchronized (this) { + this.cancelled = true; + return super.cancel(); + } } } @@ -243,10 +263,11 @@ public void run() { ContainerToken containerToken = event.getContainerToken(); TaskAttemptId taskAttemptID = event.getTaskAttemptID(); - Timer timer = new Timer(true); - ContainerManager proxy = null; + CommandTimerTask timerTask = new CommandTimerTask(Thread + .currentThread(), event); + switch(event.getType()) { case CONTAINER_REMOTE_LAUNCH: @@ -254,16 +275,16 @@ public void run() { = (ContainerRemoteLaunchEvent) event; try { - timer.schedule(new CommandTimer(Thread.currentThread(), event), - nmTimeOut); + commandTimer.schedule(timerTask, nmTimeOut); proxy = getCMProxy(containerID, containerManagerBindAddr, containerToken); // Interruped during getProxy, but that didn't throw exception - if (Thread.currentThread().isInterrupted()) { + if (Thread.interrupted()) { // The timer cancelled the command in the mean while. - String message = "Start-container for " + event.getContainerID() + String message = "Container launch failed for " + containerID + + " : Start-container for " + event.getContainerID() + " got interrupted. Returning."; sendContainerLaunchFailedMsg(taskAttemptID, message); return; @@ -280,11 +301,12 @@ public void run() { StartContainerResponse response = proxy.startContainer(startRequest); // container started properly. Stop the timer - timer.cancel(); - if (Thread.currentThread().isInterrupted()) { + timerTask.cancel(); + if (Thread.interrupted()) { // The timer cancelled the command in the mean while, but // startContainer didn't throw exception - String message = "Start-container for " + event.getContainerID() + String message = "Container launch failed for " + containerID + + " : Start-container for " + event.getContainerID() + " got interrupted. Returning."; sendContainerLaunchFailedMsg(taskAttemptID, message); return; @@ -309,12 +331,19 @@ public void run() { context.getEventHandler().handle( new TaskAttemptContainerLaunchedEvent(taskAttemptID, port)); } catch (Throwable t) { + if (Thread.interrupted()) { + // The timer cancelled the command in the mean while. + LOG.info("Start-container for " + event.getContainerID() + + " got interrupted."); + } String message = "Container launch failed for " + containerID + " : " + StringUtils.stringifyException(t); sendContainerLaunchFailedMsg(taskAttemptID, message); } finally { - timer.cancel(); - ContainerLauncherImpl.this.rpc.stopProxy(proxy, getConfig()); + timerTask.cancel(); + if (proxy != null) { + ContainerLauncherImpl.this.rpc.stopProxy(proxy, getConfig()); + } } break; @@ -331,13 +360,12 @@ public void run() { } else { try { - timer.schedule(new CommandTimer(Thread.currentThread(), event), - nmTimeOut); + commandTimer.schedule(timerTask, nmTimeOut); proxy = getCMProxy(containerID, containerManagerBindAddr, containerToken); - if (Thread.currentThread().isInterrupted()) { + if (Thread.interrupted()) { // The timer cancelled the command in the mean while. No need to // return, send cleanedup event anyways. LOG.info("Stop-container for " + event.getContainerID() @@ -353,6 +381,14 @@ public void run() { proxy.stopContainer(stopRequest); } } catch (Throwable t) { + + if (Thread.interrupted()) { + // The timer cancelled the command in the mean while, clear the + // interrupt flag + LOG.info("Stop-container for " + event.getContainerID() + + " got interrupted."); + } + // ignore the cleanup failure String message = "cleanup failed for container " + event.getContainerID() + " : " @@ -363,8 +399,18 @@ public void run() { message)); LOG.warn(message); } finally { - timer.cancel(); - ContainerLauncherImpl.this.rpc.stopProxy(proxy, getConfig()); + timerTask.cancel(); + if (Thread.interrupted()) { + LOG.info("Stop-container for " + event.getContainerID() + + " got interrupted."); + // ignore the cleanup failure + context.getEventHandler() + .handle(new TaskAttemptDiagnosticsUpdateEvent(taskAttemptID, + "cleanup failed for container " + event.getContainerID())); + } + if (proxy != null) { + ContainerLauncherImpl.this.rpc.stopProxy(proxy, getConfig()); + } } // after killing, send killed event to taskattempt diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestContainerLauncher.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestContainerLauncher.java index b2686e2314..860133fada 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestContainerLauncher.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestContainerLauncher.java @@ -88,11 +88,19 @@ private void test(boolean swallowInterrupts) throws Exception { app.waitForState(job, JobState.FAILED); - LOG.info("attempt.getDiagnostics: " + attempt.getDiagnostics()); - Assert.assertTrue(attempt.getDiagnostics().toString().contains( - "Container launch failed for container_0_0000_01_000000 : ")); - Assert.assertTrue(attempt.getDiagnostics().toString().contains( - ": java.lang.InterruptedException")); + String diagnostics = attempt.getDiagnostics().toString(); + LOG.info("attempt.getDiagnostics: " + diagnostics); + if (swallowInterrupts) { + Assert.assertEquals("[Container launch failed for " + + "container_0_0000_01_000000 : Start-container for " + + "container_0_0000_01_000000 got interrupted. Returning.]", + diagnostics); + } else { + Assert.assertTrue(diagnostics.contains("Container launch failed for " + + "container_0_0000_01_000000 : ")); + Assert.assertTrue(diagnostics + .contains(": java.lang.InterruptedException")); + } app.stop(); } @@ -119,11 +127,10 @@ protected ContainerManager getCMProxy(ContainerId containerID, } } catch (InterruptedException e) { LOG.info(e); - if (!swallowInterrupts) { + if (!MRAppWithSlowNM.this.swallowInterrupts) { throw new IOException(e); - } else { - Thread.currentThread().interrupt(); } + Thread.currentThread().interrupt(); } return null; } From 9d7402e0af79bd31cf012f749e2df5e404fad7c8 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Wed, 16 Nov 2011 16:30:05 +0000 Subject: [PATCH 4/4] MAPREDUCE-3407. Fixed pom files to refer to the correct MR app-jar needed by the integration tests. Contributed by Hitesh Shah. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1202766 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 9 ++++++--- .../hadoop-mapreduce-client-jobclient/pom.xml | 5 ++++- .../hadoop-yarn-applications-distributedshell/pom.xml | 5 ++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 53e2efaba8..e4e086cb80 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -94,9 +94,6 @@ Release 0.23.1 - Unreleased MAPREDUCE-3102. Changed NodeManager to fail fast when LinuxContainerExecutor has wrong configuration or permissions. (Hitesh Shah via vinodkv) - MAPREDUCE-3355. Fixed MR AM's ContainerLauncher to handle node-command - timeouts correctly. (vinodkv) - OPTIMIZATIONS BUG FIXES @@ -138,6 +135,12 @@ Release 0.23.1 - Unreleased MAPREDUCE-3324. Not All HttpServer tools links (stacks,logs,config,metrics) are accessible through all UI servers (Jonathan Eagles via mahadev) + MAPREDUCE-3355. Fixed MR AM's ContainerLauncher to handle node-command + timeouts correctly. (vinodkv) + + MAPREDUCE-3407. Fixed pom files to refer to the correct MR app-jar needed + by the integration tests. (Hitesh Shah via vinodkv) + Release 0.23.0 - 2011-11-01 INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml index ee4e7b4cbe..d1f06ce9b1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml @@ -102,8 +102,11 @@ maven-surefire-plugin - ${project.build.directory}/${project.artifactId}-${project.version}.jar + ${project.parent.basedir}/hadoop-mapreduce-client-app/target/hadoop-mapreduce-client-app-${project.version}.jar + + ${java.home} + ${project.build.directory}/${project.artifactId}-${project.version}-tests.jar diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml index 60baa84d7f..a6aff52546 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml @@ -112,7 +112,10 @@ ${project.build.directory}/${project.artifactId}-${project.version}.jar - + + ${java.home} + +