From 79d3d35398cb5348cfd62e41e3318ec7a337421a Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Fri, 26 Apr 2019 01:21:28 +0000 Subject: [PATCH] YARN-9486. Docker container exited with failure does not get clean up correctly. Contributed by Eric Yang --- .../containermanager/launcher/ContainerCleanup.java | 6 ++++-- .../launcher/ContainerRelaunch.java | 9 ++++++++- .../launcher/TestContainerCleanup.java | 13 +++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerCleanup.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerCleanup.java index b63becfe59..faf926aab6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerCleanup.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerCleanup.java @@ -95,8 +95,10 @@ public void run() { + " killed in store", e); } - // launch flag will be set to true if process already launched - boolean alreadyLaunched = !launch.markLaunched(); + // launch flag will be set to true if process already launched, + // in process of launching, or failed to launch. + boolean alreadyLaunched = !launch.markLaunched() || + launch.isLaunchCompleted(); if (!alreadyLaunched) { LOG.info("Container " + containerIdStr + " not launched." + " No cleanup needed to be done"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java index 226b53d2ca..f1656059a4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java @@ -87,7 +87,14 @@ public Integer call() { Path nmPrivateTruststorePath = (container.getCredentials().getSecretKey( AMSecretKeys.YARN_APPLICATION_AM_TRUSTSTORE) == null) ? null : getNmPrivateTruststorePath(appIdStr, containerIdStr); - pidFilePath = getPidFilePath(appIdStr, containerIdStr); + try { + // try to locate existing pid file. + pidFilePath = getPidFilePath(appIdStr, containerIdStr); + } catch (IOException e) { + // reset pid file path if it did not exist. + String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr); + pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath); + } LOG.info("Relaunch container with " + "workDir = " + containerWorkDir.toString() diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerCleanup.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerCleanup.java index 6c99379655..17574df02b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerCleanup.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerCleanup.java @@ -79,6 +79,7 @@ public void setup() throws Exception { launch = mock(ContainerLaunch.class); launch.containerAlreadyLaunched = new AtomicBoolean(false); + launch.completed = new AtomicBoolean(false); launch.pidFilePath = new Path("target/" + containerId.toString() + ".pid"); when(launch.getContainerPid()).thenReturn(containerId.toString()); @@ -105,4 +106,16 @@ public void testCleanup() throws Exception { Assert.assertEquals("signal", ContainerExecutor.Signal.TERM, captor.getValue().getSignal()); } + + @Test + public void testFailedExitCleanup() throws Exception { + launch.completed.set(true); + cleanup.run(); + ArgumentCaptor captor = + ArgumentCaptor.forClass(ContainerSignalContext.class); + + verify(executor, Mockito.times(1)).signalContainer(captor.capture()); + Assert.assertEquals("signal", ContainerExecutor.Signal.TERM, + captor.getValue().getSignal()); + } }