YARN-6920. Fix resource leak that happens during container re-initialization. (asuresh)
This commit is contained in:
parent
c61f2c4198
commit
8d3fd81980
@ -398,6 +398,8 @@ private void testContainerManagement(NMClientImpl nmClient,
|
|||||||
"will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
|
"will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
|
||||||
testCommitContainer(container.getId(), true);
|
testCommitContainer(container.getId(), true);
|
||||||
testReInitializeContainer(container.getId(), clc, false);
|
testReInitializeContainer(container.getId(), clc, false);
|
||||||
|
testGetContainerStatus(container, i, ContainerState.RUNNING,
|
||||||
|
"will be Re-initialized", Arrays.asList(new Integer[] {-1000}));
|
||||||
testCommitContainer(container.getId(), false);
|
testCommitContainer(container.getId(), false);
|
||||||
} else {
|
} else {
|
||||||
testReInitializeContainer(container.getId(), clc, true);
|
testReInitializeContainer(container.getId(), clc, true);
|
||||||
@ -449,7 +451,7 @@ private void testGetContainerStatus(Container container, int index,
|
|||||||
ContainerState state, String diagnostics, List<Integer> exitStatuses)
|
ContainerState state, String diagnostics, List<Integer> exitStatuses)
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
sleep(250);
|
||||||
ContainerStatus status = nmClient.getContainerStatus(
|
ContainerStatus status = nmClient.getContainerStatus(
|
||||||
container.getId(), container.getNodeId());
|
container.getId(), container.getNodeId());
|
||||||
// NodeManager may still need some time to get the stable
|
// NodeManager may still need some time to get the stable
|
||||||
@ -460,14 +462,11 @@ private void testGetContainerStatus(Container container, int index,
|
|||||||
status.getDiagnostics().contains(diagnostics));
|
status.getDiagnostics().contains(diagnostics));
|
||||||
|
|
||||||
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
|
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
|
||||||
", but the actual exit status code is: " + status.getExitStatus(),
|
", but the actual exit status code is: " +
|
||||||
|
status.getExitStatus(),
|
||||||
exitStatuses.contains(status.getExitStatus()));
|
exitStatuses.contains(status.getExitStatus()));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
Thread.sleep(100);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -559,9 +558,7 @@ private void testReInitializeContainer(ContainerId containerId,
|
|||||||
ContainerLaunchContext clc, boolean autoCommit)
|
ContainerLaunchContext clc, boolean autoCommit)
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
try {
|
try {
|
||||||
sleep(250);
|
|
||||||
nmClient.reInitializeContainer(containerId, clc, autoCommit);
|
nmClient.reInitializeContainer(containerId, clc, autoCommit);
|
||||||
sleep(250);
|
|
||||||
} catch (YarnException e) {
|
} catch (YarnException e) {
|
||||||
// NM container will only be in SCHEDULED state, so expect the increase
|
// NM container will only be in SCHEDULED state, so expect the increase
|
||||||
// action to fail.
|
// action to fail.
|
||||||
|
@ -1397,6 +1397,10 @@ public void transition(ContainerImpl container,
|
|||||||
container.resourceSet =
|
container.resourceSet =
|
||||||
container.reInitContext.mergedResourceSet(container.resourceSet);
|
container.reInitContext.mergedResourceSet(container.resourceSet);
|
||||||
container.isMarkeForKilling = false;
|
container.isMarkeForKilling = false;
|
||||||
|
// Ensure Resources are decremented.
|
||||||
|
container.dispatcher.getEventHandler().handle(
|
||||||
|
new ContainerSchedulerEvent(container,
|
||||||
|
ContainerSchedulerEventType.CONTAINER_COMPLETED));
|
||||||
container.sendScheduleEvent();
|
container.sendScheduleEvent();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -466,4 +466,8 @@ public ContainersMonitor getContainersMonitor() {
|
|||||||
return this.context.getContainerManager().getContainersMonitor();
|
return this.context.getContainerManager().getContainersMonitor();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public ResourceUtilization getCurrentUtilization() {
|
||||||
|
return this.utilizationTracker.getCurrentUtilization();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -74,6 +74,7 @@
|
|||||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
|
||||||
import org.apache.hadoop.yarn.api.records.SerializedException;
|
import org.apache.hadoop.yarn.api.records.SerializedException;
|
||||||
import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
|
import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
|
||||||
import org.apache.hadoop.yarn.api.records.Token;
|
import org.apache.hadoop.yarn.api.records.Token;
|
||||||
@ -437,7 +438,15 @@ private String[] testContainerReInitSuccess(boolean autoCommit)
|
|||||||
|
|
||||||
File newStartFile = new File(tmpDir, "start_file_n.txt").getAbsoluteFile();
|
File newStartFile = new File(tmpDir, "start_file_n.txt").getAbsoluteFile();
|
||||||
|
|
||||||
|
ResourceUtilization beforeUpgrade =
|
||||||
|
ResourceUtilization.newInstance(
|
||||||
|
containerManager.getContainerScheduler().getCurrentUtilization());
|
||||||
prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
|
prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
|
||||||
|
ResourceUtilization afterUpgrade =
|
||||||
|
ResourceUtilization.newInstance(
|
||||||
|
containerManager.getContainerScheduler().getCurrentUtilization());
|
||||||
|
Assert.assertEquals("Possible resource leak detected !!",
|
||||||
|
beforeUpgrade, afterUpgrade);
|
||||||
|
|
||||||
// Assert that the First process is not alive anymore
|
// Assert that the First process is not alive anymore
|
||||||
Assert.assertFalse("Process is still alive!",
|
Assert.assertFalse("Process is still alive!",
|
||||||
|
Loading…
Reference in New Issue
Block a user