From 8d3fd81980275fa81e7a5539b1751f38a63b6911 Mon Sep 17 00:00:00 2001
From: Arun Suresh
Date: Mon, 7 Aug 2017 18:59:25 -0700
Subject: [PATCH] YARN-6920. Fix resource leak that happens during container
 re-initialization. (asuresh)

---
 .../yarn/client/api/impl/TestNMClient.java    | 37 +++++++++----------
 .../container/ContainerImpl.java              |  4 ++
 .../scheduler/ContainerScheduler.java         |  4 ++
 .../TestContainerManager.java                 |  9 +++++
 4 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java
index 1034f7eacc..6bd0816206 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java
@@ -398,6 +398,8 @@ private void testContainerManagement(NMClientImpl nmClient,
             "will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
         testCommitContainer(container.getId(), true);
         testReInitializeContainer(container.getId(), clc, false);
+        testGetContainerStatus(container, i, ContainerState.RUNNING,
+            "will be Re-initialized", Arrays.asList(new Integer[] {-1000}));
         testCommitContainer(container.getId(), false);
       } else {
         testReInitializeContainer(container.getId(), clc, true);
@@ -449,24 +451,21 @@ private void testGetContainerStatus(Container container, int index,
       ContainerState state, String diagnostics, List<Integer> exitStatuses)
           throws YarnException, IOException {
     while (true) {
-      try {
-        ContainerStatus status = nmClient.getContainerStatus(
-            container.getId(), container.getNodeId());
-        // NodeManager may still need some time to get the stable
-        // container status
-        if (status.getState() == state) {
-          assertEquals(container.getId(), status.getContainerId());
-          assertTrue("" + index + ": " + status.getDiagnostics(),
-              status.getDiagnostics().contains(diagnostics));
-
-          assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
-              ", but the actual exit status code is: " + status.getExitStatus(),
-              exitStatuses.contains(status.getExitStatus()));
-          break;
-        }
-        Thread.sleep(100);
-      } catch (InterruptedException e) {
-        e.printStackTrace();
+      sleep(250);
+      ContainerStatus status = nmClient.getContainerStatus(
+          container.getId(), container.getNodeId());
+      // NodeManager may still need some time to get the stable
+      // container status
+      if (status.getState() == state) {
+        assertEquals(container.getId(), status.getContainerId());
+        assertTrue("" + index + ": " + status.getDiagnostics(),
+            status.getDiagnostics().contains(diagnostics));
+
+        assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
+            ", but the actual exit status code is: "
+            + status.getExitStatus(),
+            exitStatuses.contains(status.getExitStatus()));
+        break;
       }
     }
   }
@@ -559,9 +558,7 @@ private void testReInitializeContainer(ContainerId containerId,
       ContainerLaunchContext clc, boolean autoCommit)
           throws YarnException, IOException {
     try {
-      sleep(250);
       nmClient.reInitializeContainer(containerId, clc, autoCommit);
-      sleep(250);
     } catch (YarnException e) {
       // NM container will only be in SCHEDULED state, so expect the increase
       // action to fail.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 46f8fa091f..c0aa6b0d63 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -1397,6 +1397,10 @@ public void transition(ContainerImpl container,
       container.resourceSet =
           container.reInitContext.mergedResourceSet(container.resourceSet);
       container.isMarkeForKilling = false;
+      // Ensure Resources are decremented.
+      container.dispatcher.getEventHandler().handle(
+          new ContainerSchedulerEvent(container,
+              ContainerSchedulerEventType.CONTAINER_COMPLETED));
       container.sendScheduleEvent();
     }
   }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
index c119bf28f9..60d6213d9a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
@@ -466,4 +466,8 @@ public ContainersMonitor getContainersMonitor() {
     return this.context.getContainerManager().getContainersMonitor();
   }
 
+  @VisibleForTesting
+  public ResourceUtilization getCurrentUtilization() {
+    return this.utilizationTracker.getCurrentUtilization();
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
index f2d20377fc..24d46b6df8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
@@ -74,6 +74,7 @@
 import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceUtilization;
 import org.apache.hadoop.yarn.api.records.SerializedException;
 import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
 import org.apache.hadoop.yarn.api.records.Token;
@@ -437,7 +438,15 @@ private String[] testContainerReInitSuccess(boolean autoCommit)
     File newStartFile = new File(tmpDir, "start_file_n.txt")
         .getAbsoluteFile();
 
+    ResourceUtilization beforeUpgrade =
+        ResourceUtilization.newInstance(
+            containerManager.getContainerScheduler().getCurrentUtilization());
     prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
+    ResourceUtilization afterUpgrade =
+        ResourceUtilization.newInstance(
+            containerManager.getContainerScheduler().getCurrentUtilization());
+    Assert.assertEquals("Possible resource leak detected !!",
+        beforeUpgrade, afterUpgrade);
 
     // Assert that the First process is not alive anymore
     Assert.assertFalse("Process is still alive!",
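
Note (not part of the patch, ignored by git am): the sketch below is a minimal, self-contained toy model of the accounting pattern this change restores, using hypothetical class and method names rather than the real NodeManager APIs. It mirrors the assertion added to TestContainerManager: scheduler utilization is sampled before and after a container upgrade and must be unchanged, which only holds if re-initialization first releases the container's tracked resources (the CONTAINER_COMPLETED event dispatched in ContainerImpl) before the subsequent schedule event re-adds them.

// Illustrative sketch only; all names here are hypothetical stand-ins.
public class ReinitAccountingSketch {

  /** Toy stand-in for the NM scheduler's utilization tracker. */
  static class UtilizationTracker {
    private int trackedMemMB = 0;

    // Resources are added whenever a container is (re)scheduled.
    void containerScheduled(int memMB) {
      trackedMemMB += memMB;
    }

    // Resources are released when a container run is marked completed.
    void containerCompleted(int memMB) {
      trackedMemMB -= memMB;
    }

    int getCurrentUtilization() {
      return trackedMemMB;
    }
  }

  public static void main(String[] args) {
    UtilizationTracker tracker = new UtilizationTracker();
    int containerMemMB = 1024;

    // Container launch: its resources are tracked once.
    tracker.containerScheduled(containerMemMB);
    int beforeUpgrade = tracker.getCurrentUtilization();

    // Re-initialization: release the old accounting first, then
    // re-schedule. Skipping the release line reproduces the leak the
    // new test assertion detects (utilization grows on every re-init).
    tracker.containerCompleted(containerMemMB);
    tracker.containerScheduled(containerMemMB);

    int afterUpgrade = tracker.getCurrentUtilization();
    if (beforeUpgrade != afterUpgrade) {
      throw new AssertionError("Possible resource leak detected !!");
    }
    System.out.println("Utilization before/after re-init: "
        + beforeUpgrade + "/" + afterUpgrade + " MB");
  }
}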