YARN-8929. DefaultOOMHandler should only pick running containers to kill upon oom events (haibochen via rkanter)
This commit is contained in:
parent
ebf8e1731d
commit
69b328943e
@ -71,7 +71,7 @@ public static Set<String> getValidCGroups() {
|
||||
}
|
||||
}
|
||||
|
||||
String CGROUP_FILE_TASKS = "tasks";
|
||||
String CGROUP_PROCS_FILE = "cgroup.procs";
|
||||
String CGROUP_PARAM_CLASSID = "classid";
|
||||
String CGROUP_PARAM_BLKIO_WEIGHT = "weight";
|
||||
|
||||
|
@ -347,7 +347,7 @@ public String getPathForCGroup(CGroupController controller, String cGroupId) {
|
||||
public String getPathForCGroupTasks(CGroupController controller,
|
||||
String cGroupId) {
|
||||
return getPathForCGroup(controller, cGroupId)
|
||||
+ Path.SEPARATOR + CGROUP_FILE_TASKS;
|
||||
+ Path.SEPARATOR + CGROUP_PROCS_FILE;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -603,7 +603,7 @@ public void updateCGroupParam(CGroupController controller, String cGroupId,
|
||||
public String getCGroupParam(CGroupController controller, String cGroupId,
|
||||
String param) throws ResourceHandlerException {
|
||||
String cGroupParamPath =
|
||||
param.equals(CGROUP_FILE_TASKS) ?
|
||||
param.equals(CGROUP_PROCS_FILE) ?
|
||||
getPathForCGroup(controller, cGroupId)
|
||||
+ Path.SEPARATOR + param :
|
||||
getPathForCGroupParam(controller, cGroupId, param);
|
||||
|
@ -34,7 +34,7 @@
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES;
|
||||
@ -116,8 +116,10 @@ private boolean isContainerOutOfLimit(Container container) {
|
||||
* Currently the killing only succeeds for PGIDS.
|
||||
*
|
||||
* @param container Container to clean up
|
||||
* @return true if the container is killed successfully, false otherwise
|
||||
*/
|
||||
private void sigKill(Container container) {
|
||||
private boolean sigKill(Container container) {
|
||||
boolean containerKilled = false;
|
||||
boolean finished = false;
|
||||
try {
|
||||
while (!finished) {
|
||||
@ -125,7 +127,7 @@ private void sigKill(Container container) {
|
||||
cgroups.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
container.getContainerId().toString(),
|
||||
CGROUP_FILE_TASKS)
|
||||
CGROUP_PROCS_FILE)
|
||||
.split("\n");
|
||||
finished = true;
|
||||
for (String pid : pids) {
|
||||
@ -154,11 +156,17 @@ private void sigKill(Container container) {
|
||||
LOG.debug("Interrupted while waiting for processes to disappear");
|
||||
}
|
||||
}
|
||||
containerKilled = true;
|
||||
} catch (ResourceHandlerException ex) {
|
||||
// the tasks file of the container may not be available because the
|
||||
// container may not have been launched at this point when the root
|
||||
// cgroup is under oom
|
||||
LOG.warn(String.format(
|
||||
"Cannot list more tasks in container %s to kill.",
|
||||
container.getContainerId()));
|
||||
}
|
||||
|
||||
return containerKilled;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -216,19 +224,34 @@ protected boolean killContainer() {
|
||||
|
||||
ArrayList<ContainerCandidate> candidates = new ArrayList<>(0);
|
||||
for (Container container : context.getContainers().values()) {
|
||||
if (!container.isRunning()) {
|
||||
// skip containers that are not running yet because killing them
|
||||
// won't release any memory to get us out of OOM.
|
||||
continue;
|
||||
// note even if it is indicated that the container is running from
|
||||
// container.isRunning(), the container process might not have been
|
||||
// running yet. From NM's perspective, a container is running as
|
||||
// soon as the container launch is handed over the container executor
|
||||
}
|
||||
candidates.add(
|
||||
new ContainerCandidate(container, isContainerOutOfLimit(container)));
|
||||
}
|
||||
Collections.sort(candidates);
|
||||
if (candidates.isEmpty()) {
|
||||
LOG.warn(
|
||||
"Found no running containers to kill in order to release memory");
|
||||
}
|
||||
|
||||
if (candidates.size() > 0) {
|
||||
ContainerCandidate candidate = candidates.get(0);
|
||||
sigKill(candidate.container);
|
||||
String message = String.format(
|
||||
"container %s killed by elastic cgroups OOM handler.",
|
||||
candidate.container.getContainerId());
|
||||
LOG.warn(message);
|
||||
containerKilled = true;
|
||||
// make sure one container is killed successfully to release memory
|
||||
for(int i = 0; !containerKilled && i < candidates.size(); i++) {
|
||||
ContainerCandidate candidate = candidates.get(i);
|
||||
if (sigKill(candidate.container)) {
|
||||
String message = String.format(
|
||||
"container %s killed by elastic cgroups OOM handler.",
|
||||
candidate.container.getContainerId());
|
||||
LOG.warn(message);
|
||||
containerKilled = true;
|
||||
}
|
||||
}
|
||||
return containerKilled;
|
||||
}
|
||||
|
@ -266,7 +266,7 @@ public void testCGroupPaths() throws IOException {
|
||||
Assert.assertEquals(expectedPath, path);
|
||||
|
||||
String expectedPathTasks = expectedPath + Path.SEPARATOR
|
||||
+ CGroupsHandler.CGROUP_FILE_TASKS;
|
||||
+ CGroupsHandler.CGROUP_PROCS_FILE;
|
||||
path = cGroupsHandler.getPathForCGroupTasks(controller, testCGroup);
|
||||
Assert.assertEquals(expectedPathTasks, path);
|
||||
|
||||
|
@ -33,7 +33,7 @@
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES;
|
||||
@ -75,16 +75,49 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two guaranteed containers, both of which are out of limit.
|
||||
* Test an OOM situation where there are no running containers that
|
||||
* can be killed.
|
||||
*/
|
||||
@Test(expected = YarnRuntimeException.class)
|
||||
public void testExceptionThrownWithNoRunningContainersToKill()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, true, 1L, false);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
|
||||
Context context = mock(Context.class);
|
||||
when(context.getContainers()).thenReturn(containers);
|
||||
|
||||
CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
|
||||
when(cGroupsHandler.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
"",
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
|
||||
DefaultOOMHandler handler = new DefaultOOMHandler(context, false) {
|
||||
@Override
|
||||
protected CGroupsHandler getCGroupsHandler() {
|
||||
return cGroupsHandler;
|
||||
}
|
||||
};
|
||||
|
||||
handler.run();
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two running guaranteed containers, both of which are out of limit.
|
||||
* We should kill the later one.
|
||||
*/
|
||||
@Test
|
||||
public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception {
|
||||
public void testBothRunningGuaranteedContainersOverLimitUponOOM()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, true, 1L);
|
||||
Container c1 = createContainer(1, true, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, true, 2L);
|
||||
Container c2 = createContainer(2, true, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -100,7 +133,7 @@ public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception {
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -109,7 +142,7 @@ public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(11));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -139,7 +172,7 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two GUARANTEED containers, one of which is out of limit.
|
||||
* We have two running GUARANTEED containers, one of which is out of limit.
|
||||
* We should kill the one that's out of its limit. This should
|
||||
* happen even if it was launched earlier than the other one.
|
||||
*/
|
||||
@ -147,9 +180,9 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, true, 2L);
|
||||
Container c1 = createContainer(1, true, 2L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, true, 1L);
|
||||
Container c2 = createContainer(2, true, 1L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -164,7 +197,7 @@ public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception {
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -175,7 +208,7 @@ public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception {
|
||||
|
||||
// container c2 is out of its limit
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -204,16 +237,16 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two GUARANTEE containers, neither of which is out of limit.
|
||||
* We have two running GUARANTEE containers, neither of which is out of limit.
|
||||
* We should kill the later launched one.
|
||||
*/
|
||||
@Test
|
||||
public void testNoGuaranteedContainerOverLimitOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, true, 1L);
|
||||
Container c1 = createContainer(1, true, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, true, 2L);
|
||||
Container c2 = createContainer(2, true, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -228,7 +261,7 @@ public void testNoGuaranteedContainerOverLimitOOM() throws Exception {
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -237,7 +270,7 @@ public void testNoGuaranteedContainerOverLimitOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -266,17 +299,250 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two opportunistic containers, both of which are out of limit.
|
||||
* We should kill the later one.
|
||||
* We have two OPPORTUNISTIC containers, one running and the other not.
|
||||
* We should kill the running one.
|
||||
*/
|
||||
@Test
|
||||
public void testKillOnlyRunningContainersUponOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L, false);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
Context context = mock(Context.class);
|
||||
when(context.getContainers()).thenReturn(containers);
|
||||
when(context.getContainerExecutor()).thenReturn(ex);
|
||||
|
||||
CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
|
||||
when(cGroupsHandler.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
"",
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
|
||||
DefaultOOMHandler handler =
|
||||
new DefaultOOMHandler(context, false) {
|
||||
@Override
|
||||
protected CGroupsHandler getCGroupsHandler() {
|
||||
return cGroupsHandler;
|
||||
}
|
||||
};
|
||||
handler.run();
|
||||
|
||||
verify(ex, times(1)).signalContainer(
|
||||
new ContainerSignalContext.Builder()
|
||||
.setPid("1235")
|
||||
.setContainer(c2)
|
||||
.setSignal(ContainerExecutor.Signal.KILL)
|
||||
.build()
|
||||
);
|
||||
verify(ex, times(1)).signalContainer(any());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* We have two 'running' OPPORTUNISTIC containers. Killing the most-
|
||||
* recently launched one fails because its cgroup.procs file is not
|
||||
* available. The other OPPORTUNISTIC containers should be killed in
|
||||
* this case.
|
||||
*/
|
||||
@Test
|
||||
public void testKillOpportunisticContainerWithKillFailuresUponOOM()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
Context context = mock(Context.class);
|
||||
when(context.getContainers()).thenReturn(containers);
|
||||
when(context.getContainerExecutor()).thenReturn(ex);
|
||||
|
||||
CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
|
||||
when(cGroupsHandler.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
"",
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
// c2 process has not started, hence no cgroup.procs file yet
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenThrow(
|
||||
new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
|
||||
|
||||
DefaultOOMHandler handler =
|
||||
new DefaultOOMHandler(context, false) {
|
||||
@Override
|
||||
protected CGroupsHandler getCGroupsHandler() {
|
||||
return cGroupsHandler;
|
||||
}
|
||||
};
|
||||
handler.run();
|
||||
|
||||
verify(ex, times(1)).signalContainer(
|
||||
new ContainerSignalContext.Builder()
|
||||
.setPid("1235")
|
||||
.setContainer(c1)
|
||||
.setSignal(ContainerExecutor.Signal.KILL)
|
||||
.build()
|
||||
);
|
||||
verify(ex, times(1)).signalContainer(any());
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two 'running' OPPORTUNISTIC containers and one GUARANTEED
|
||||
* container. Killing two OPPORTUNISTIC containers fails because they
|
||||
* have not really started running as processes since the root cgroup
|
||||
* is under oom. We should try to kill one container successfully. In
|
||||
* this case, the GUARANTEED container should be killed.
|
||||
*/
|
||||
@Test
|
||||
public void testKillGuaranteedContainerWithKillFailuresUponOOM()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(3, true, 2L, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
Context context = mock(Context.class);
|
||||
when(context.getContainers()).thenReturn(containers);
|
||||
when(context.getContainerExecutor()).thenReturn(ex);
|
||||
|
||||
CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
|
||||
when(cGroupsHandler.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
"",
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
// c1 process has not started, hence no cgroup.procs file yet
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenThrow(
|
||||
new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
|
||||
// c2 process has not started, hence no cgroup.procs file yet
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenThrow(
|
||||
new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
|
||||
DefaultOOMHandler handler =
|
||||
new DefaultOOMHandler(context, false) {
|
||||
@Override
|
||||
protected CGroupsHandler getCGroupsHandler() {
|
||||
return cGroupsHandler;
|
||||
}
|
||||
};
|
||||
handler.run();
|
||||
|
||||
verify(ex, times(1)).signalContainer(
|
||||
new ContainerSignalContext.Builder()
|
||||
.setPid("1235")
|
||||
.setContainer(c3)
|
||||
.setSignal(ContainerExecutor.Signal.KILL)
|
||||
.build()
|
||||
);
|
||||
verify(ex, times(1)).signalContainer(any());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test an OOM situation where no containers are killed successfully.
|
||||
*
|
||||
* We have two 'running' containers, none of which are actually
|
||||
* running as processes. Their cgroup.procs file is not available,
|
||||
* so kill them won't succeed.
|
||||
*/
|
||||
@Test(expected = YarnRuntimeException.class)
|
||||
public void testExceptionThrownWhenNoContainersKilledSuccessfully()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
Context context = mock(Context.class);
|
||||
when(context.getContainers()).thenReturn(containers);
|
||||
when(context.getContainerExecutor()).thenReturn(ex);
|
||||
|
||||
CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
|
||||
when(cGroupsHandler.getCGroupParam(
|
||||
CGroupsHandler.CGroupController.MEMORY,
|
||||
"",
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
// c1 process has not started, hence no cgroup.procs file yet
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenThrow(
|
||||
new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
|
||||
// c2 process has not started, hence no cgroup.procs file yet
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenThrow(
|
||||
new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
|
||||
|
||||
DefaultOOMHandler handler =
|
||||
new DefaultOOMHandler(context, false) {
|
||||
@Override
|
||||
protected CGroupsHandler getCGroupsHandler() {
|
||||
return cGroupsHandler;
|
||||
}
|
||||
};
|
||||
handler.run();
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two running opportunistic containers, both of which are out of
|
||||
* limit. We should kill the later one.
|
||||
*/
|
||||
@Test
|
||||
public void testBothOpportunisticContainersOverLimitUponOOM()
|
||||
throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L);
|
||||
Container c1 = createContainer(1, false, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -292,7 +558,7 @@ public void testBothOpportunisticContainersOverLimitUponOOM()
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -301,7 +567,7 @@ public void testBothOpportunisticContainersOverLimitUponOOM()
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(11));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -331,17 +597,17 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers, one of which is out of limit.
|
||||
* We should kill the one that's out of its limit. This should
|
||||
* We have two running OPPORTUNISTIC containers, one of which is out of
|
||||
* limit. We should kill the one that's out of its limit. This should
|
||||
* happen even if it was launched earlier than the other one.
|
||||
*/
|
||||
@Test
|
||||
public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 2L);
|
||||
Container c1 = createContainer(1, false, 2L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 1L);
|
||||
Container c2 = createContainer(2, false, 1L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -356,7 +622,7 @@ public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception {
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -366,7 +632,7 @@ public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception {
|
||||
.thenReturn(getMB(9));
|
||||
// contnainer c2 is out of its limit
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -395,16 +661,16 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers, neither of which is out of limit.
|
||||
* We should kill the later one.
|
||||
* We have two running OPPORTUNISTIC containers, neither of which is out of
|
||||
* limit. We should kill the later one.
|
||||
*/
|
||||
@Test
|
||||
public void testNoOpportunisticContainerOverLimitOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(1, false, 1L);
|
||||
Container c1 = createContainer(1, false, 1L, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(2, false, 2L);
|
||||
Container c2 = createContainer(2, false, 2L, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -419,7 +685,7 @@ public void testNoOpportunisticContainerOverLimitOOM() throws Exception {
|
||||
CGROUP_PARAM_MEMORY_OOM_CONTROL))
|
||||
.thenReturn("under_oom 1").thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -428,7 +694,7 @@ public void testNoOpportunisticContainerOverLimitOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -457,8 +723,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* One of the OPPORTUNISTIC container is out of limit.
|
||||
* We have two running OPPORTUNISTIC containers and one running GUARANTEED
|
||||
* container. One of the OPPORTUNISTIC container is out of limit.
|
||||
* OOM is resolved after killing the OPPORTUNISTIC container that
|
||||
* exceeded its limit even though it is launched earlier than the
|
||||
* other OPPORTUNISTIC container.
|
||||
@ -469,11 +735,11 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM()
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
int currentContainerId = 0;
|
||||
Container c1 = createContainer(currentContainerId++, false, 2);
|
||||
Container c1 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -489,7 +755,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM()
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -500,7 +766,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM()
|
||||
|
||||
// container c2 is out of its limit
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -510,7 +776,7 @@ public void testKillOneOverLimitOpportunisticContainerUponOOM()
|
||||
.thenReturn(getMB(11));
|
||||
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -538,8 +804,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
verify(ex, times(1)).signalContainer(any());
|
||||
}
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* None of the containers exceeded its memory limit.
|
||||
* We have two running OPPORTUNISTIC containers and one running GUARANTEED
|
||||
* container. None of the containers exceeded its memory limit.
|
||||
* OOM is resolved after killing the most recently launched OPPORTUNISTIC
|
||||
* container.
|
||||
*/
|
||||
@ -548,11 +814,11 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception {
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
int currentContainerId = 0;
|
||||
Container c1 = createContainer(currentContainerId++, false, 1);
|
||||
Container c1 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -568,7 +834,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception {
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -577,7 +843,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -586,7 +852,7 @@ public void testKillOneLaterOpportunisticContainerUponOOM() throws Exception {
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -615,8 +881,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* One of the OPPORTUNISTIC container is out of limit.
|
||||
* We have two running OPPORTUNISTIC containers and one running GUARANTEED
|
||||
* container. One of the OPPORTUNISTIC container is out of limit.
|
||||
* OOM is resolved after killing both OPPORTUNISTIC containers.
|
||||
*/
|
||||
@Test
|
||||
@ -625,11 +891,11 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception {
|
||||
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(currentContainerId++, false, 2);
|
||||
Container c1 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -646,7 +912,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception {
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -655,7 +921,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -664,7 +930,7 @@ public void testKillBothOpportunisticContainerUponOOM() throws Exception {
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(11));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -701,8 +967,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* the GUARANTEED container is out of limit. OOM is resolved
|
||||
* We have two running OPPORTUNISTIC containers and one running GUARANTEED
|
||||
* container. The GUARANTEED container is out of limit. OOM is resolved
|
||||
* after first killing the two OPPORTUNISTIC containers and then the
|
||||
* GUARANTEED container.
|
||||
*/
|
||||
@ -712,11 +978,11 @@ public void testKillGuaranteedContainerUponOOM() throws Exception {
|
||||
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(currentContainerId++, false, 2);
|
||||
Container c1 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -734,7 +1000,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception {
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -743,7 +1009,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -752,7 +1018,7 @@ public void testKillGuaranteedContainerUponOOM() throws Exception {
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -795,8 +1061,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* None of the containers exceeded its memory limit.
|
||||
* We have two running OPPORTUNISTIC containers and one running GUARANTEED
|
||||
* container. None of the containers exceeded its memory limit.
|
||||
* OOM is resolved after killing all running containers.
|
||||
*/
|
||||
@Test
|
||||
@ -805,11 +1071,11 @@ public void testKillAllContainersUponOOM() throws Exception {
|
||||
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(currentContainerId++, false, 1);
|
||||
Container c1 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1);
|
||||
Container c3 = createContainer(currentContainerId++, true, 1, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -827,7 +1093,7 @@ public void testKillAllContainersUponOOM() throws Exception {
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 0");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -836,7 +1102,7 @@ public void testKillAllContainersUponOOM() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -845,7 +1111,7 @@ public void testKillAllContainersUponOOM() throws Exception {
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -888,7 +1154,8 @@ protected CGroupsHandler getCGroupsHandler() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We have two OPPORTUNISTIC containers and one GUARANTEED container.
|
||||
* We have two running OPPORTUNISTIC containers and one running
|
||||
* GUARANTEED container.
|
||||
* None of the containers exceeded its memory limit.
|
||||
* OOM is not resolved even after killing all running containers.
|
||||
* A YarnRuntimeException is excepted to be thrown.
|
||||
@ -899,11 +1166,11 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception {
|
||||
|
||||
ConcurrentHashMap<ContainerId, Container> containers =
|
||||
new ConcurrentHashMap<>();
|
||||
Container c1 = createContainer(currentContainerId++, false, 1);
|
||||
Container c1 = createContainer(currentContainerId++, false, 1, true);
|
||||
containers.put(c1.getContainerId(), c1);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2);
|
||||
Container c2 = createContainer(currentContainerId++, false, 2, true);
|
||||
containers.put(c2.getContainerId(), c2);
|
||||
Container c3 = createContainer(currentContainerId++, true, 3);
|
||||
Container c3 = createContainer(currentContainerId++, true, 3, true);
|
||||
containers.put(c3.getContainerId(), c3);
|
||||
|
||||
ContainerExecutor ex = createContainerExecutor(containers);
|
||||
@ -921,7 +1188,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception {
|
||||
.thenReturn("under_oom 1")
|
||||
.thenReturn("under_oom 1");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c1.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1234").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -930,7 +1197,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception {
|
||||
c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c2.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1235").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -939,7 +1206,7 @@ public void testOOMUnresolvedAfterKillingAllContainers() throws Exception {
|
||||
c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
|
||||
.thenReturn(getMB(9));
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_FILE_TASKS))
|
||||
c3.getContainerId().toString(), CGROUP_PROCS_FILE))
|
||||
.thenReturn("1236").thenReturn("");
|
||||
when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
|
||||
c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
|
||||
@ -974,7 +1241,7 @@ private static ContainerId createContainerId(int id) {
|
||||
}
|
||||
|
||||
private static Container createContainer(int containerId,
|
||||
boolean guaranteed, long launchTime) {
|
||||
boolean guaranteed, long launchTime, boolean running) {
|
||||
Container c1 = mock(Container.class);
|
||||
ContainerId cid1 = createContainerId(containerId);
|
||||
when(c1.getContainerId()).thenReturn(cid1);
|
||||
@ -987,6 +1254,7 @@ private static Container createContainer(int containerId,
|
||||
|
||||
when(c1.getResource()).thenReturn(Resource.newInstance(10, 1));
|
||||
when(c1.getContainerLaunchTime()).thenReturn(launchTime);
|
||||
when(c1.isRunning()).thenReturn(running);
|
||||
|
||||
return c1;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user