From 69b328943edf2f61c8fc139934420e3f10bf3813 Mon Sep 17 00:00:00 2001 From: Robert Kanter Date: Wed, 24 Oct 2018 13:15:50 -0700 Subject: [PATCH] YARN-8929. DefaultOOMHandler should only pick running containers to kill upon oom events (haibochen via rkanter) --- .../linux/resources/CGroupsHandler.java | 2 +- .../linux/resources/CGroupsHandlerImpl.java | 4 +- .../linux/resources/DefaultOOMHandler.java | 45 +- .../resources/TestCGroupsHandlerImpl.java | 2 +- .../resources/TestDefaultOOMHandler.java | 434 ++++++++++++++---- 5 files changed, 389 insertions(+), 98 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java index 9dc16c37a2..dcb058961d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java @@ -71,7 +71,7 @@ public static Set getValidCGroups() { } } - String CGROUP_FILE_TASKS = "tasks"; + String CGROUP_PROCS_FILE = "cgroup.procs"; String CGROUP_PARAM_CLASSID = "classid"; String CGROUP_PARAM_BLKIO_WEIGHT = "weight"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java index a547e8fd9f..050d0a8030 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java @@ -347,7 +347,7 @@ public String getPathForCGroup(CGroupController controller, String cGroupId) { public String getPathForCGroupTasks(CGroupController controller, String cGroupId) { return getPathForCGroup(controller, cGroupId) - + Path.SEPARATOR + CGROUP_FILE_TASKS; + + Path.SEPARATOR + CGROUP_PROCS_FILE; } @Override @@ -603,7 +603,7 @@ public void updateCGroupParam(CGroupController controller, String cGroupId, public String getCGroupParam(CGroupController controller, String cGroupId, String param) throws ResourceHandlerException { String cGroupParamPath = - param.equals(CGROUP_FILE_TASKS) ? + param.equals(CGROUP_PROCS_FILE) ? 
getPathForCGroup(controller, cGroupId) + Path.SEPARATOR + param : getPathForCGroupParam(controller, cGroupId, param); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java index 86137b514a..844bb6c414 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java @@ -34,7 +34,7 @@ import java.util.ArrayList; import java.util.Collections; -import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES; @@ -116,8 +116,10 @@ private boolean isContainerOutOfLimit(Container container) { * Currently the killing only succeeds for PGIDS. 
* * @param container Container to clean up + * @return true if the container is killed successfully, false otherwise */ - private void sigKill(Container container) { + private boolean sigKill(Container container) { + boolean containerKilled = false; boolean finished = false; try { while (!finished) { @@ -125,7 +127,7 @@ private void sigKill(Container container) { cgroups.getCGroupParam( CGroupsHandler.CGroupController.MEMORY, container.getContainerId().toString(), - CGROUP_FILE_TASKS) + CGROUP_PROCS_FILE) .split("\n"); finished = true; for (String pid : pids) { @@ -154,11 +156,17 @@ private void sigKill(Container container) { LOG.debug("Interrupted while waiting for processes to disappear"); } } + containerKilled = true; } catch (ResourceHandlerException ex) { + // the tasks file of the container may not be available because the + // container may not have been launched at this point when the root + // cgroup is under oom LOG.warn(String.format( "Cannot list more tasks in container %s to kill.", container.getContainerId())); } + + return containerKilled; } /** @@ -216,19 +224,34 @@ protected boolean killContainer() { ArrayList candidates = new ArrayList<>(0); for (Container container : context.getContainers().values()) { + if (!container.isRunning()) { + // skip containers that are not running yet because killing them + // won't release any memory to get us out of OOM. + continue; + // note even if it is indicated that the container is running from + // container.isRunning(), the container process might not have been + // running yet. 
From NM's perspective, a container is running as + // soon as the container launch is handed over to the container executor + } candidates.add( new ContainerCandidate(container, isContainerOutOfLimit(container))); } Collections.sort(candidates); + if (candidates.isEmpty()) { + LOG.warn( + "Found no running containers to kill in order to release memory"); + } - if (candidates.size() > 0) { - ContainerCandidate candidate = candidates.get(0); - sigKill(candidate.container); - String message = String.format( - "container %s killed by elastic cgroups OOM handler.", - candidate.container.getContainerId()); - LOG.warn(message); - containerKilled = true; + // make sure one container is killed successfully to release memory + for(int i = 0; !containerKilled && i < candidates.size(); i++) { + ContainerCandidate candidate = candidates.get(i); + if (sigKill(candidate.container)) { + String message = String.format( + "container %s killed by elastic cgroups OOM handler.", + candidate.container.getContainerId()); + LOG.warn(message); + containerKilled = true; + } } return containerKilled; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java index 0d7c097896..ea6fb529a3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java @@ -266,7 +266,7 @@ public void testCGroupPaths() 
throws IOException { Assert.assertEquals(expectedPath, path); String expectedPathTasks = expectedPath + Path.SEPARATOR - + CGroupsHandler.CGROUP_FILE_TASKS; + + CGroupsHandler.CGROUP_PROCS_FILE; path = cGroupsHandler.getPathForCGroupTasks(controller, testCGroup); Assert.assertEquals(expectedPathTasks, path); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java index e239067832..8a6ca74609 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java @@ -33,7 +33,7 @@ import java.io.IOException; import java.util.concurrent.ConcurrentHashMap; -import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES; @@ -75,16 +75,49 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * 
We have two guaranteed containers, both of which are out of limit. + * Test an OOM situation where there are no running containers that + * can be killed. + */ + @Test(expected = YarnRuntimeException.class) + public void testExceptionThrownWithNoRunningContainersToKill() + throws Exception { + ConcurrentHashMap containers = + new ConcurrentHashMap<>(); + Container c1 = createContainer(1, true, 1L, false); + containers.put(c1.getContainerId(), c1); + + Context context = mock(Context.class); + when(context.getContainers()).thenReturn(containers); + + CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class); + when(cGroupsHandler.getCGroupParam( + CGroupsHandler.CGroupController.MEMORY, + "", + CGROUP_PARAM_MEMORY_OOM_CONTROL)) + .thenReturn("under_oom 1").thenReturn("under_oom 0"); + + DefaultOOMHandler handler = new DefaultOOMHandler(context, false) { + @Override + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + }; + + handler.run(); + } + + /** + * We have two running guaranteed containers, both of which are out of limit. * We should kill the later one. 
*/ @Test - public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception { + public void testBothRunningGuaranteedContainersOverLimitUponOOM() + throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, true, 1L); + Container c1 = createContainer(1, true, 1L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, true, 2L); + Container c2 = createContainer(2, true, 2L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -100,7 +133,7 @@ public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception { CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -109,7 +142,7 @@ public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(11)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -139,7 +172,7 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two GUARANTEED containers, one of which is out of limit. + * We have two running GUARANTEED containers, one of which is out of limit. * We should kill the one that's out of its limit. 
This should * happen even if it was launched earlier than the other one. */ @@ -147,9 +180,9 @@ protected CGroupsHandler getCGroupsHandler() { public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, true, 2L); + Container c1 = createContainer(1, true, 2L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, true, 1L); + Container c2 = createContainer(2, true, 1L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -164,7 +197,7 @@ public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception { CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -175,7 +208,7 @@ public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception { // container c2 is out of its limit when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -204,16 +237,16 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two GUARANTEE containers, neither of which is out of limit. + * We have two running GUARANTEE containers, neither of which is out of limit. * We should kill the later launched one. 
*/ @Test public void testNoGuaranteedContainerOverLimitOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, true, 1L); + Container c1 = createContainer(1, true, 1L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, true, 2L); + Container c2 = createContainer(2, true, 2L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -228,7 +261,7 @@ public void testNoGuaranteedContainerOverLimitOOM() throws Exception { CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -237,7 +270,7 @@ public void testNoGuaranteedContainerOverLimitOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -266,17 +299,250 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two opportunistic containers, both of which are out of limit. - * We should kill the later one. + * We have two OPPORTUNISTIC containers, one running and the other not. + * We should kill the running one. 
+ */ + @Test + public void testKillOnlyRunningContainersUponOOM() throws Exception { + ConcurrentHashMap containers = + new ConcurrentHashMap<>(); + Container c1 = createContainer(1, false, 1L, false); + containers.put(c1.getContainerId(), c1); + Container c2 = createContainer(2, false, 2L, true); + containers.put(c2.getContainerId(), c2); + + ContainerExecutor ex = createContainerExecutor(containers); + Context context = mock(Context.class); + when(context.getContainers()).thenReturn(containers); + when(context.getContainerExecutor()).thenReturn(ex); + + CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class); + when(cGroupsHandler.getCGroupParam( + CGroupsHandler.CGroupController.MEMORY, + "", + CGROUP_PARAM_MEMORY_OOM_CONTROL)) + .thenReturn("under_oom 1").thenReturn("under_oom 0"); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenReturn("1234").thenReturn(""); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) + .thenReturn(getMB(9)); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) + .thenReturn(getMB(9)); + + DefaultOOMHandler handler = + new DefaultOOMHandler(context, false) { + @Override + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + }; + handler.run(); + + verify(ex, times(1)).signalContainer( + new ContainerSignalContext.Builder() + .setPid("1235") + .setContainer(c2) + .setSignal(ContainerExecutor.Signal.KILL) + .build() + ); + verify(ex, times(1)).signalContainer(any()); + } + + + /** + * We have two 'running' OPPORTUNISTIC containers. Killing the most- + * recently launched one fails because its cgroup.procs file is not + * available. The other OPPORTUNISTIC containers should be killed in + * this case. 
+ */ + @Test + public void testKillOpportunisticContainerWithKillFailuresUponOOM() + throws Exception { + ConcurrentHashMap containers = + new ConcurrentHashMap<>(); + Container c1 = createContainer(1, false, 1L, true); + containers.put(c1.getContainerId(), c1); + Container c2 = createContainer(2, false, 2L, true); + containers.put(c2.getContainerId(), c2); + + ContainerExecutor ex = createContainerExecutor(containers); + Context context = mock(Context.class); + when(context.getContainers()).thenReturn(containers); + when(context.getContainerExecutor()).thenReturn(ex); + + CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class); + when(cGroupsHandler.getCGroupParam( + CGroupsHandler.CGroupController.MEMORY, + "", + CGROUP_PARAM_MEMORY_OOM_CONTROL)) + .thenReturn("under_oom 1").thenReturn("under_oom 0"); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenReturn("1234").thenReturn(""); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) + .thenReturn(getMB(9)); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) + .thenReturn(getMB(9)); + // c2 process has not started, hence no cgroup.procs file yet + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenThrow( + new ResourceHandlerException(CGROUP_PROCS_FILE + " not found")); + + DefaultOOMHandler handler = + new DefaultOOMHandler(context, false) { + @Override + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + }; + handler.run(); + + verify(ex, times(1)).signalContainer( + new ContainerSignalContext.Builder() + .setPid("1235") + .setContainer(c1) + .setSignal(ContainerExecutor.Signal.KILL) + .build() + ); + verify(ex, 
times(1)).signalContainer(any()); + } + + /** + * We have two 'running' OPPORTUNISTIC containers and one GUARANTEED + * container. Killing two OPPORTUNISTIC containers fails because they + * have not really started running as processes since the root cgroup + * is under oom. We should try to kill one container successfully. In + * this case, the GUARANTEED container should be killed. + */ + @Test + public void testKillGuaranteedContainerWithKillFailuresUponOOM() + throws Exception { + ConcurrentHashMap containers = + new ConcurrentHashMap<>(); + Container c1 = createContainer(1, false, 1L, true); + containers.put(c1.getContainerId(), c1); + Container c2 = createContainer(2, false, 2L, true); + containers.put(c2.getContainerId(), c2); + Container c3 = createContainer(3, true, 2L, true); + containers.put(c3.getContainerId(), c3); + + ContainerExecutor ex = createContainerExecutor(containers); + Context context = mock(Context.class); + when(context.getContainers()).thenReturn(containers); + when(context.getContainerExecutor()).thenReturn(ex); + + CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class); + when(cGroupsHandler.getCGroupParam( + CGroupsHandler.CGroupController.MEMORY, + "", + CGROUP_PARAM_MEMORY_OOM_CONTROL)) + .thenReturn("under_oom 1").thenReturn("under_oom 0"); + // c1 process has not started, hence no cgroup.procs file yet + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenThrow( + new ResourceHandlerException(CGROUP_PROCS_FILE + " not found")); + // c2 process has not started, hence no cgroup.procs file yet + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenThrow( + new ResourceHandlerException(CGROUP_PROCS_FILE + " not found")); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) + 
.thenReturn("1234").thenReturn(""); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) + .thenReturn(getMB(9)); + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) + .thenReturn(getMB(9)); + + DefaultOOMHandler handler = + new DefaultOOMHandler(context, false) { + @Override + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + }; + handler.run(); + + verify(ex, times(1)).signalContainer( + new ContainerSignalContext.Builder() + .setPid("1235") + .setContainer(c3) + .setSignal(ContainerExecutor.Signal.KILL) + .build() + ); + verify(ex, times(1)).signalContainer(any()); + } + + /** + * Test an OOM situation where no containers are killed successfully. + * + * We have two 'running' containers, none of which are actually + * running as processes. Their cgroup.procs file is not available, + * so killing them won't succeed. 
+ */ + @Test(expected = YarnRuntimeException.class) + public void testExceptionThrownWhenNoContainersKilledSuccessfully() + throws Exception { + ConcurrentHashMap containers = + new ConcurrentHashMap<>(); + Container c1 = createContainer(1, false, 1L, true); + containers.put(c1.getContainerId(), c1); + Container c2 = createContainer(2, false, 2L, true); + containers.put(c2.getContainerId(), c2); + + ContainerExecutor ex = createContainerExecutor(containers); + Context context = mock(Context.class); + when(context.getContainers()).thenReturn(containers); + when(context.getContainerExecutor()).thenReturn(ex); + + CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class); + when(cGroupsHandler.getCGroupParam( + CGroupsHandler.CGroupController.MEMORY, + "", + CGROUP_PARAM_MEMORY_OOM_CONTROL)) + .thenReturn("under_oom 1").thenReturn("under_oom 0"); + // c1 process has not started, hence no cgroup.procs file yet + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenThrow( + new ResourceHandlerException(CGROUP_PROCS_FILE + " not found")); + // c2 process has not started, hence no cgroup.procs file yet + when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) + .thenThrow( + new ResourceHandlerException(CGROUP_PROCS_FILE + " not found")); + + DefaultOOMHandler handler = + new DefaultOOMHandler(context, false) { + @Override + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + }; + handler.run(); + } + + /** + * We have two running opportunistic containers, both of which are out of + * limit. We should kill the later one. 
*/ @Test public void testBothOpportunisticContainersOverLimitUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, false, 1L); + Container c1 = createContainer(1, false, 1L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, false, 2L); + Container c2 = createContainer(2, false, 2L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -292,7 +558,7 @@ public void testBothOpportunisticContainersOverLimitUponOOM() CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -301,7 +567,7 @@ public void testBothOpportunisticContainersOverLimitUponOOM() c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(11)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -331,17 +597,17 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers, one of which is out of limit. - * We should kill the one that's out of its limit. This should + * We have two running OPPORTUNISTIC containers, one of which is out of + * limit. We should kill the one that's out of its limit. This should * happen even if it was launched earlier than the other one. 
*/ @Test public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, false, 2L); + Container c1 = createContainer(1, false, 2L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, false, 1L); + Container c2 = createContainer(2, false, 1L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -356,7 +622,7 @@ public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception { CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -366,7 +632,7 @@ public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception { .thenReturn(getMB(9)); // contnainer c2 is out of its limit when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -395,16 +661,16 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers, neither of which is out of limit. - * We should kill the later one. + * We have two running OPPORTUNISTIC containers, neither of which is out of + * limit. We should kill the later one. 
*/ @Test public void testNoOpportunisticContainerOverLimitOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(1, false, 1L); + Container c1 = createContainer(1, false, 1L, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(2, false, 2L); + Container c2 = createContainer(2, false, 2L, true); containers.put(c2.getContainerId(), c2); ContainerExecutor ex = createContainerExecutor(containers); @@ -419,7 +685,7 @@ public void testNoOpportunisticContainerOverLimitOOM() throws Exception { CGROUP_PARAM_MEMORY_OOM_CONTROL)) .thenReturn("under_oom 1").thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -428,7 +694,7 @@ public void testNoOpportunisticContainerOverLimitOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -457,8 +723,8 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. - * One of the OPPORTUNISTIC container is out of limit. + * We have two running OPPORTUNISTIC containers and one running GUARANTEED + * container. One of the OPPORTUNISTIC container is out of limit. 
* OOM is resolved after killing the OPPORTUNISTIC container that * exceeded its limit even though it is launched earlier than the * other OPPORTUNISTIC container. @@ -469,11 +735,11 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM() ConcurrentHashMap containers = new ConcurrentHashMap<>(); int currentContainerId = 0; - Container c1 = createContainer(currentContainerId++, false, 2); + Container c1 = createContainer(currentContainerId++, false, 2, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 1); + Container c2 = createContainer(currentContainerId++, false, 1, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 1); + Container c3 = createContainer(currentContainerId++, true, 1, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -489,7 +755,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM() .thenReturn("under_oom 1") .thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -500,7 +766,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM() // container c2 is out of its limit when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -510,7 +776,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM() 
.thenReturn(getMB(11)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -538,8 +804,8 @@ protected CGroupsHandler getCGroupsHandler() { verify(ex, times(1)).signalContainer(any()); } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. - * None of the containers exceeded its memory limit. + * We have two running OPPORTUNISTIC containers and one running GUARANTEED + * container. None of the containers exceeded its memory limit. * OOM is resolved after killing the most recently launched OPPORTUNISTIC * container. */ @@ -548,11 +814,11 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); int currentContainerId = 0; - Container c1 = createContainer(currentContainerId++, false, 1); + Container c1 = createContainer(currentContainerId++, false, 1, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 2); + Container c2 = createContainer(currentContainerId++, false, 2, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 1); + Container c3 = createContainer(currentContainerId++, true, 1, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -568,7 +834,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception { .thenReturn("under_oom 1") .thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) 
.thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -577,7 +843,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -586,7 +852,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception { c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -615,8 +881,8 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. - * One of the OPPORTUNISTIC container is out of limit. + * We have two running OPPORTUNISTIC containers and one running GUARANTEED + * container. One of the OPPORTUNISTIC containers is out of limit. * OOM is resolved after killing both OPPORTUNISTIC containers. 
*/ @Test @@ -625,11 +891,11 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(currentContainerId++, false, 2); + Container c1 = createContainer(currentContainerId++, false, 2, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 1); + Container c2 = createContainer(currentContainerId++, false, 1, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 1); + Container c3 = createContainer(currentContainerId++, true, 1, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -646,7 +912,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception { .thenReturn("under_oom 1") .thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -655,7 +921,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -664,7 +930,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception { c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) 
.thenReturn(getMB(11)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -701,8 +967,8 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. - * the GUARANTEED container is out of limit. OOM is resolved + * We have two running OPPORTUNISTIC containers and one running GUARANTEED + * container. The GUARANTEED container is out of limit. OOM is resolved * after first killing the two OPPORTUNISTIC containers and then the * GUARANTEED container. */ @@ -712,11 +978,11 @@ public void testKillGuaranteedContainerUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(currentContainerId++, false, 2); + Container c1 = createContainer(currentContainerId++, false, 2, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 1); + Container c2 = createContainer(currentContainerId++, false, 1, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 1); + Container c3 = createContainer(currentContainerId++, true, 1, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -734,7 +1000,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception { .thenReturn("under_oom 1") .thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); 
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -743,7 +1009,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -752,7 +1018,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception { c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -795,8 +1061,8 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. - * None of the containers exceeded its memory limit. + * We have two running OPPORTUNISTIC containers and one running GUARANTEED + * container. None of the containers exceeded its memory limit. * OOM is resolved after killing all running containers. 
*/ @Test @@ -805,11 +1071,11 @@ public void testKillAllContainersUponOOM() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(currentContainerId++, false, 1); + Container c1 = createContainer(currentContainerId++, false, 1, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 2); + Container c2 = createContainer(currentContainerId++, false, 2, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 1); + Container c3 = createContainer(currentContainerId++, true, 1, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -827,7 +1093,7 @@ public void testKillAllContainersUponOOM() throws Exception { .thenReturn("under_oom 1") .thenReturn("under_oom 0"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -836,7 +1102,7 @@ public void testKillAllContainersUponOOM() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -845,7 +1111,7 @@ public void testKillAllContainersUponOOM() throws Exception { c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); 
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -888,7 +1154,8 @@ protected CGroupsHandler getCGroupsHandler() { } /** - * We have two OPPORTUNISTIC containers and one GUARANTEED container. + * We have two running OPPORTUNISTIC containers and one running + * GUARANTEED container. * None of the containers exceeded its memory limit. * OOM is not resolved even after killing all running containers. * A YarnRuntimeException is excepted to be thrown. @@ -899,11 +1166,11 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception { ConcurrentHashMap containers = new ConcurrentHashMap<>(); - Container c1 = createContainer(currentContainerId++, false, 1); + Container c1 = createContainer(currentContainerId++, false, 1, true); containers.put(c1.getContainerId(), c1); - Container c2 = createContainer(currentContainerId++, false, 2); + Container c2 = createContainer(currentContainerId++, false, 2, true); containers.put(c2.getContainerId(), c2); - Container c3 = createContainer(currentContainerId++, true, 3); + Container c3 = createContainer(currentContainerId++, true, 3, true); containers.put(c3.getContainerId(), c3); ContainerExecutor ex = createContainerExecutor(containers); @@ -921,7 +1188,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception { .thenReturn("under_oom 1") .thenReturn("under_oom 1"); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c1.getContainerId().toString(), CGROUP_FILE_TASKS)) + c1.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1234").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c1.getContainerId().toString(), 
CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -930,7 +1197,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception { c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c2.getContainerId().toString(), CGROUP_FILE_TASKS)) + c2.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1235").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -939,7 +1206,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception { c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) .thenReturn(getMB(9)); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, - c3.getContainerId().toString(), CGROUP_FILE_TASKS)) + c3.getContainerId().toString(), CGROUP_PROCS_FILE)) .thenReturn("1236").thenReturn(""); when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY, c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES)) @@ -974,7 +1241,7 @@ private static ContainerId createContainerId(int id) { } private static Container createContainer(int containerId, - boolean guaranteed, long launchTime) { + boolean guaranteed, long launchTime, boolean running) { Container c1 = mock(Container.class); ContainerId cid1 = createContainerId(containerId); when(c1.getContainerId()).thenReturn(cid1); @@ -987,6 +1254,7 @@ private static Container createContainer(int containerId, when(c1.getResource()).thenReturn(Resource.newInstance(10, 1)); when(c1.getContainerLaunchTime()).thenReturn(launchTime); + when(c1.isRunning()).thenReturn(running); return c1; }