From 3344ba70e027c929e07bad5e6877c796d41181e9 Mon Sep 17 00:00:00 2001 From: Arun Suresh Date: Wed, 8 Jun 2016 08:31:32 -0700 Subject: [PATCH] YARN-5204. Properly report status of killed/stopped queued containers. (Konstantinos Karanasos via asuresh) --- .../queuing/QueuingContainerManagerImpl.java | 15 +- .../queuing/TestQueuingContainerManager.java | 133 ++++++++++++++---- 2 files changed, 117 insertions(+), 31 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/QueuingContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/QueuingContainerManagerImpl.java index a1e3bdb20a..38b1b07287 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/QueuingContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/QueuingContainerManagerImpl.java @@ -175,8 +175,9 @@ protected void stopContainerInternal(ContainerId containerID) } nodeStatusUpdater.sendOutofBandHeartBeat(); + } else { + super.stopContainerInternal(containerID); } - super.stopContainerInternal(containerID); } /** @@ -456,6 +457,18 @@ protected ContainerStatus getContainerStatusInternal(ContainerId containerID, ContainerExitStatus.INVALID, this.context.getQueuingContext() .getQueuedContainers().get(containerID).getResource(), executionType); + } else { + // Check if part of the stopped/killed queued containers. + for (ContainerTokenIdentifier cTokenId : this.context + .getQueuingContext().getKilledQueuedContainers().keySet()) { + if (cTokenId.getContainerID().equals(containerID)) { + return BuilderUtils.newContainerStatus(containerID, + org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE, + this.context.getQueuingContext().getKilledQueuedContainers() + .get(cTokenId), ContainerExitStatus.ABORTED, cTokenId + .getResource(), cTokenId.getExecutionType()); + } + } } } return super.getContainerStatusInternal(containerID, nmTokenIdentifier); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/TestQueuingContainerManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/TestQueuingContainerManager.java index 4d44d8d85d..caebef7354 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/TestQueuingContainerManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/queuing/TestQueuingContainerManager.java @@ -24,13 +24,13 @@ import java.util.List; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnsupportedFileSystemException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest; import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest; +import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -41,15 +41,12 @@ import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; -import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.MockResourceCalculatorPlugin; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.MockResourceCalculatorProcessTree; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.junit.Assert; import org.junit.Test; @@ -58,11 +55,6 @@ * Class for testing the {@link QueuingContainerManagerImpl}. */ public class TestQueuingContainerManager extends BaseContainerManagerTest { - - interface HasResources { - boolean decide(Context context, ContainerId cId); - } - public TestQueuingContainerManager() throws UnsupportedFileSystemException { super(); } @@ -78,18 +70,6 @@ protected ContainerManagerImpl createContainerManager( DeletionService delSrvc) { return new QueuingContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater, metrics, dirsHandler) { - - @Override - public void serviceInit(Configuration conf) throws Exception { - conf.set( - YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR, - MockResourceCalculatorPlugin.class.getCanonicalName()); - conf.set( - YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE, - MockResourceCalculatorProcessTree.class.getCanonicalName()); - super.serviceInit(conf); - } - @Override public void setBlockNewContainerRequests(boolean blockNewContainerRequests) { @@ -398,7 +378,7 @@ public void testKillOpportunisticForGuaranteedContainer() throws Exception { containerManager.startContainers(allRequests); BaseContainerManagerTest.waitForNMContainerState(containerManager, - createContainerId(0), ContainerState.DONE, 30); + createContainerId(0), ContainerState.DONE, 40); Thread.sleep(5000); // Get container statuses. Container 0 should be killed, container 1 @@ -429,7 +409,7 @@ public void testKillOpportunisticForGuaranteedContainer() throws Exception { // Make sure the remaining OPPORTUNISTIC container starts its execution. BaseContainerManagerTest.waitForNMContainerState(containerManager, - createContainerId(2), ContainerState.DONE, 30); + createContainerId(2), ContainerState.DONE, 40); Thread.sleep(5000); statRequest = GetContainerStatusesRequest.newInstance(Arrays.asList( createContainerId(1))); @@ -488,13 +468,12 @@ public void testKillMultipleOpportunisticContainers() throws Exception { containerManager.startContainers(allRequests); BaseContainerManagerTest.waitForNMContainerState(containerManager, - createContainerId(0), ContainerState.DONE, 30); + createContainerId(0), ContainerState.DONE, 40); Thread.sleep(5000); // Get container statuses. Container 0 should be killed, container 1 // should be queued and container 2 should be running. int killedContainers = 0; - int runningContainers = 0; List statList = new ArrayList(); for (int i = 0; i < 4; i++) { statList.add(createContainerId(i)); @@ -508,14 +487,108 @@ public void testKillMultipleOpportunisticContainers() throws Exception { "Container killed by the ApplicationMaster")) { killedContainers++; } - if (status.getState() == - org.apache.hadoop.yarn.api.records.ContainerState.RUNNING) { - runningContainers++; - } System.out.println("\nStatus : [" + status + "]\n"); } Assert.assertEquals(2, killedContainers); - Assert.assertEquals(2, runningContainers); + } + + /** + * Start running one GUARANTEED container and queue two OPPORTUNISTIC ones. + * Try killing one of the two queued containers. + * @throws Exception + */ + @Test + public void testStopQueuedContainer() throws Exception { + shouldDeleteWait = true; + containerManager.start(); + + ContainerLaunchContext containerLaunchContext = + recordFactory.newRecordInstance(ContainerLaunchContext.class); + + List list = new ArrayList<>(); + list.add(StartContainerRequest.newInstance( + containerLaunchContext, + createContainerToken(createContainerId(0), DUMMY_RM_IDENTIFIER, + context.getNodeId(), + user, BuilderUtils.newResource(2048, 1), + context.getContainerTokenSecretManager(), null, + ExecutionType.GUARANTEED))); + list.add(StartContainerRequest.newInstance( + containerLaunchContext, + createContainerToken(createContainerId(1), DUMMY_RM_IDENTIFIER, + context.getNodeId(), + user, BuilderUtils.newResource(512, 1), + context.getContainerTokenSecretManager(), null, + ExecutionType.OPPORTUNISTIC))); + list.add(StartContainerRequest.newInstance( + containerLaunchContext, + createContainerToken(createContainerId(2), DUMMY_RM_IDENTIFIER, + context.getNodeId(), + user, BuilderUtils.newResource(512, 1), + context.getContainerTokenSecretManager(), null, + ExecutionType.OPPORTUNISTIC))); + + StartContainersRequest allRequests = + StartContainersRequest.newInstance(list); + containerManager.startContainers(allRequests); + + Thread.sleep(2000); + + // Assert there is initially one container running and two queued. + int runningContainersNo = 0; + int queuedContainersNo = 0; + List statList = new ArrayList(); + for (int i = 0; i < 3; i++) { + statList.add(createContainerId(i)); + } + GetContainerStatusesRequest statRequest = GetContainerStatusesRequest + .newInstance(statList); + List containerStatuses = containerManager + .getContainerStatuses(statRequest).getContainerStatuses(); + for (ContainerStatus status : containerStatuses) { + if (status.getState() == + org.apache.hadoop.yarn.api.records.ContainerState.RUNNING) { + runningContainersNo++; + } else if (status.getState() == + org.apache.hadoop.yarn.api.records.ContainerState.QUEUED) { + queuedContainersNo++; + } + System.out.println("\nStatus : [" + status + "]\n"); + } + + Assert.assertEquals(1, runningContainersNo); + Assert.assertEquals(2, queuedContainersNo); + + // Stop one of the two queued containers. + StopContainersRequest stopRequest = StopContainersRequest. + newInstance(Arrays.asList(createContainerId(1))); + containerManager.stopContainers(stopRequest); + + Thread.sleep(2000); + + // Assert queued container got properly stopped. + statList.clear(); + for (int i = 0; i < 3; i++) { + statList.add(createContainerId(i)); + } + statRequest = GetContainerStatusesRequest.newInstance(statList); + containerStatuses = containerManager.getContainerStatuses(statRequest) + .getContainerStatuses(); + for (ContainerStatus status : containerStatuses) { + if (status.getContainerId().equals(createContainerId(0))) { + Assert.assertEquals( + org.apache.hadoop.yarn.api.records.ContainerState.RUNNING, + status.getState()); + } else if (status.getContainerId().equals(createContainerId(1))) { + Assert.assertTrue(status.getDiagnostics().contains( + "Queued container request removed")); + } else if (status.getContainerId().equals(createContainerId(2))) { + Assert.assertEquals( + org.apache.hadoop.yarn.api.records.ContainerState.QUEUED, + status.getState()); + } + System.out.println("\nStatus : [" + status + "]\n"); + } } }