YARN-6920. Fix resource leak that happens during container re-initialization. (asuresh)

This commit is contained in:
Arun Suresh 2017-08-07 18:59:25 -07:00
parent c61f2c4198
commit 8d3fd81980
4 changed files with 34 additions and 20 deletions

View File

@ -398,6 +398,8 @@ private void testContainerManagement(NMClientImpl nmClient,
"will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
testCommitContainer(container.getId(), true);
testReInitializeContainer(container.getId(), clc, false);
testGetContainerStatus(container, i, ContainerState.RUNNING,
"will be Re-initialized", Arrays.asList(new Integer[] {-1000}));
testCommitContainer(container.getId(), false);
} else {
testReInitializeContainer(container.getId(), clc, true);
@ -449,24 +451,21 @@ private void testGetContainerStatus(Container container, int index,
ContainerState state, String diagnostics, List<Integer> exitStatuses)
throws YarnException, IOException {
while (true) {
try {
ContainerStatus status = nmClient.getContainerStatus(
container.getId(), container.getNodeId());
// NodeManager may still need some time to get the stable
// container status
if (status.getState() == state) {
assertEquals(container.getId(), status.getContainerId());
assertTrue("" + index + ": " + status.getDiagnostics(),
status.getDiagnostics().contains(diagnostics));
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
", but the actual exit status code is: " + status.getExitStatus(),
exitStatuses.contains(status.getExitStatus()));
break;
}
Thread.sleep(100);
} catch (InterruptedException e) {
e.printStackTrace();
sleep(250);
ContainerStatus status = nmClient.getContainerStatus(
container.getId(), container.getNodeId());
// NodeManager may still need some time to get the stable
// container status
if (status.getState() == state) {
assertEquals(container.getId(), status.getContainerId());
assertTrue("" + index + ": " + status.getDiagnostics(),
status.getDiagnostics().contains(diagnostics));
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
", but the actual exit status code is: " +
status.getExitStatus(),
exitStatuses.contains(status.getExitStatus()));
break;
}
}
}
@ -559,9 +558,7 @@ private void testReInitializeContainer(ContainerId containerId,
ContainerLaunchContext clc, boolean autoCommit)
throws YarnException, IOException {
try {
sleep(250);
nmClient.reInitializeContainer(containerId, clc, autoCommit);
sleep(250);
} catch (YarnException e) {
// NM container will only be in SCHEDULED state, so expect the increase
// action to fail.

View File

@ -1397,6 +1397,10 @@ public void transition(ContainerImpl container,
container.resourceSet =
container.reInitContext.mergedResourceSet(container.resourceSet);
container.isMarkeForKilling = false;
// Ensure Resources are decremented.
container.dispatcher.getEventHandler().handle(
new ContainerSchedulerEvent(container,
ContainerSchedulerEventType.CONTAINER_COMPLETED));
container.sendScheduleEvent();
}
}

View File

@ -466,4 +466,8 @@ public ContainersMonitor getContainersMonitor() {
return this.context.getContainerManager().getContainersMonitor();
}
/**
 * Returns the current resource utilization as reported by this scheduler's
 * utilization tracker. Exposed so that tests can inspect it.
 *
 * NOTE(review): the result is handed back exactly as the tracker returns it;
 * whether that object is a snapshot or live tracker state is not visible
 * here -- confirm before holding on to it across scheduler events.
 *
 * @return the value of {@code utilizationTracker.getCurrentUtilization()}
 */
@VisibleForTesting
public ResourceUtilization getCurrentUtilization() {
return this.utilizationTracker.getCurrentUtilization();
}
}

View File

@ -74,6 +74,7 @@
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
import org.apache.hadoop.yarn.api.records.SerializedException;
import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
import org.apache.hadoop.yarn.api.records.Token;
@ -437,7 +438,15 @@ private String[] testContainerReInitSuccess(boolean autoCommit)
File newStartFile = new File(tmpDir, "start_file_n.txt").getAbsoluteFile();
ResourceUtilization beforeUpgrade =
ResourceUtilization.newInstance(
containerManager.getContainerScheduler().getCurrentUtilization());
prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
ResourceUtilization afterUpgrade =
ResourceUtilization.newInstance(
containerManager.getContainerScheduler().getCurrentUtilization());
Assert.assertEquals("Possible resource leak detected !!",
beforeUpgrade, afterUpgrade);
// Assert that the First process is not alive anymore
Assert.assertFalse("Process is still alive!",