From ed9d60e888d0acfd748fda7f66249f5b79a3ed6d Mon Sep 17 00:00:00 2001 From: Eric Yang Date: Fri, 27 Jul 2018 19:33:58 -0400 Subject: [PATCH] YARN-8508. Release GPU resource for killed container. Contributed by Chandni Singh --- .../nodemanager/LinuxContainerExecutor.java | 34 +++++++++---------- .../TestLinuxContainerExecutor.java | 9 ++++- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 03b88a4499..4253f2f0d5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -573,15 +573,7 @@ private int handleLaunchForLaunchType(ContainerStartContext ctx, return handleExitCode(e, container, containerId); } finally { resourcesHandler.postExecute(containerId); - - try { - if (resourceHandlerChain != null) { - resourceHandlerChain.postComplete(containerId); - } - } catch (ResourceHandlerException e) { - LOG.warn("ResourceHandlerChain.postComplete failed for " + - "containerId: " + containerId + ". Exception: " + e); - } + postComplete(containerId); } return 0; @@ -721,14 +713,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx) return super.reacquireContainer(ctx); } finally { resourcesHandler.postExecute(containerId); - if (resourceHandlerChain != null) { - try { - resourceHandlerChain.postComplete(containerId); - } catch (ResourceHandlerException e) { - LOG.warn("ResourceHandlerChain.postComplete failed for " + - "containerId: " + containerId + " Exception: " + e); - } - } + postComplete(containerId); } } @@ -798,6 +783,8 @@ public boolean reapContainer(ContainerReapContext ctx) throws IOException { logOutput(e.getOutput()); throw new IOException("Error in reaping container " + container.getContainerId().toString() + " exit = " + retCode, e); + } finally { + postComplete(container.getContainerId()); } return true; } @@ -968,4 +955,17 @@ public void removeDockerContainer(String containerId) { LOG.warn("Unable to remove docker container: " + containerId); } } + + @VisibleForTesting + void postComplete(final ContainerId containerId) { + try { + if (resourceHandlerChain != null) { + LOG.debug("{} post complete", containerId); + resourceHandlerChain.postComplete(containerId); + } + } catch (ResourceHandlerException e) { + LOG.warn("ResourceHandlerChain.postComplete failed for " + + "containerId: {}. Exception: ", containerId, e); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java index ddbf3b9056..6d77fc488d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java @@ -25,11 +25,14 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import static org.mockito.Matchers.anyObject; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntime; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerReapContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +43,7 @@ import java.io.PrintWriter; import java.net.InetSocketAddress; import java.util.ArrayList; +import java.util.Arrays; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; @@ -667,12 +671,15 @@ public void testRemoveDockerContainer() throws Exception { @Test public void testReapContainer() throws Exception { Container container = mock(Container.class); - LinuxContainerExecutor lce = mock(LinuxContainerExecutor.class); + LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class); + LinuxContainerExecutor lce = spy(new LinuxContainerExecutor( + containerRuntime)); ContainerReapContext.Builder builder = new ContainerReapContext.Builder(); builder.setContainer(container).setUser("foo"); ContainerReapContext ctx = builder.build(); lce.reapContainer(ctx); verify(lce, times(1)).reapContainer(ctx); + verify(lce, times(1)).postComplete(anyObject()); } @Test