From 44f7ee7192d150b43a3012f06cec67be5b64edd0 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Sun, 6 Oct 2013 18:32:16 +0000 Subject: [PATCH] YARN-1278. Fixed NodeManager to not delete local resources for apps on resync command from RM - a bug caused by YARN-1149. Contributed by Hitesh Shah. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1529657 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 + .../CMgrCompletedContainersEvent.java | 30 +++++++- .../yarn/server/nodemanager/NodeManager.java | 3 +- .../nodemanager/NodeStatusUpdaterImpl.java | 3 +- .../ContainerManagerImpl.java | 77 +++++++++++-------- .../nodemanager/TestNodeManagerResync.java | 7 +- .../nodemanager/TestNodeStatusUpdater.java | 20 ++--- 7 files changed, 94 insertions(+), 49 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index dff651395a..2aa8c55659 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -188,6 +188,9 @@ Release 2.1.2 - UNRELEASED and log-dirs correctly even when there are no resources to localize for the container. (Siddharth Seth via vinodkv) + YARN-1278. Fixed NodeManager to not delete local resources for apps on resync + command from RM - a bug caused by YARN-1149. (Hitesh Shah via vinodkv) + Release 2.1.1-beta - 2013-09-23 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java index e5e55374af..5f7d01ea2b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java @@ -25,13 +25,39 @@ public class CMgrCompletedContainersEvent extends ContainerManagerEvent { private final List containerToCleanup; - - public CMgrCompletedContainersEvent(List containersToCleanup) { + private final Reason reason; + + public CMgrCompletedContainersEvent(List containersToCleanup, + Reason reason) { super(ContainerManagerEventType.FINISH_CONTAINERS); this.containerToCleanup = containersToCleanup; + this.reason = reason; } public List getContainersToCleanup() { return this.containerToCleanup; } + + public Reason getReason() { + return reason; + } + + public static enum Reason { + /** + * Container is killed as NodeManager is shutting down + */ + ON_SHUTDOWN, + + /** + * Container is killed as the Nodemanager is re-syncing with the + * ResourceManager + */ + ON_NODEMANAGER_RESYNC, + + /** + * Container is killed on request by the ResourceManager + */ + BY_RESOURCEMANAGER + } + } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 79b9d7a83f..998fca775d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -226,7 +226,8 @@ protected void resyncWithRM() { public void run() { LOG.info("Notifying ContainerManager to block new container-requests"); containerManager.setBlockNewContainerRequests(true); - containerManager.cleanUpApplications(NodeManagerEventType.RESYNC); + LOG.info("Cleaning up running containers on resync"); + containerManager.cleanupContainersOnNMResync(); ((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater(); } }.start(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index b52f9d1650..d6af3fe7bd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -501,7 +501,8 @@ public void run() { .getContainersToCleanup(); if (!containersToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( - new CMgrCompletedContainersEvent(containersToCleanup)); + new CMgrCompletedContainersEvent(containersToCleanup, + CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER)); } List appsToCleanup = response.getApplicationsToCleanup(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index d158b43f2d..3091c4adb6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -87,7 +87,6 @@ import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger; -import org.apache.hadoop.yarn.server.nodemanager.NodeManagerEventType; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; @@ -306,7 +305,7 @@ public void serviceStop() throws Exception { try { serviceStopped = true; if (context != null) { - cleanUpApplications(NodeManagerEventType.SHUTDOWN); + cleanUpApplicationsOnNMShutDown(); } } finally { this.writeLock.unlock(); @@ -320,7 +319,7 @@ public void serviceStop() throws Exception { super.serviceStop(); } - public void cleanUpApplications(NodeManagerEventType eventType) { + public void cleanUpApplicationsOnNMShutDown() { Map applications = this.context.getApplications(); if (applications.isEmpty()) { @@ -336,33 +335,15 @@ public void cleanUpApplications(NodeManagerEventType eventType) { LOG.info("Waiting for Applications to be Finished"); - switch (eventType) { - case SHUTDOWN: - long waitStartTime = System.currentTimeMillis(); - while (!applications.isEmpty() - && System.currentTimeMillis() - waitStartTime - < waitForContainersOnShutdownMillis) { - try { - Thread.sleep(1000); - } catch (InterruptedException ex) { - LOG.warn("Interrupted while sleeping on applications finish on shutdown", - ex); - } - } - break; - case RESYNC: - while (!applications.isEmpty()) { - try { - Thread.sleep(1000); - } catch (InterruptedException ex) { - LOG.warn("Interrupted while sleeping on applications finish on resync", - ex); - } - } - break; - default: - throw new YarnRuntimeException("Get an unknown NodeManagerEventType: " - + eventType); + long waitStartTime = System.currentTimeMillis(); + while (!applications.isEmpty() + && System.currentTimeMillis() - waitStartTime < waitForContainersOnShutdownMillis) { + try { + Thread.sleep(1000); + } catch (InterruptedException ex) { + LOG.warn( + "Interrupted while sleeping on applications finish on shutdown", ex); + } } // All applications Finished @@ -374,6 +355,40 @@ public void cleanUpApplications(NodeManagerEventType eventType) { } } + public void cleanupContainersOnNMResync() { + Map containers = context.getContainers(); + if (containers.isEmpty()) { + return; + } + LOG.info("Containers still running on " + + CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC + " : " + + containers.keySet()); + + List containerIds = + new ArrayList(containers.keySet()); + + LOG.info("Waiting for containers to be killed"); + + this.handle(new CMgrCompletedContainersEvent(containerIds, + CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC)); + while (!containers.isEmpty()) { + try { + Thread.sleep(1000); + nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext(); + } catch (InterruptedException ex) { + LOG.warn("Interrupted while sleeping on container kill on resync", ex); + } + } + + // All containers killed + if (containers.isEmpty()) { + LOG.info("All containers in DONE state"); + } else { + LOG.info("Done waiting for containers to be killed. Still alive: " + + containers.keySet()); + } + } + // Get the remoteUGI corresponding to the api call. protected UserGroupInformation getRemoteUgi() throws YarnException { @@ -850,7 +865,7 @@ public void handle(ContainerManagerEvent event) { break; default: throw new YarnRuntimeException( - "Get an unknown ContainerManagerEvent type: " + event.getType()); + "Got an unknown ContainerManagerEvent type: " + event.getType()); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java index 3e0846b042..f2090fac0f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java @@ -102,7 +102,11 @@ public void testKillContainersOnResync() throws IOException, } catch (BrokenBarrierException e) { } Assert.assertEquals(2, ((TestNodeManager1) nm).getNMRegistrationCount()); - + // Only containers should be killed on resync, apps should lie around. That + // way local resources for apps can be used beyond resync without + // relocalization + Assert.assertTrue(nm.getNMContext().getApplications() + .containsKey(cId.getApplicationAttemptId().getApplicationId())); Assert.assertFalse(assertionFailedInThread.get()); nm.stop(); @@ -285,7 +289,6 @@ public void run() { recordFactory.newRecordInstance(ContainerLaunchContext.class); try { while (!isStopped && numContainers < 10) { - ContainerId cId = TestNodeManagerShutdown.createContainerId(); StartContainerRequest scRequest = StartContainerRequest.newInstance(containerLaunchContext, null); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 8372aff851..f356a2ab83 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -93,6 +93,7 @@ import org.junit.Before; import org.junit.Test; +@SuppressWarnings("rawtypes") public class TestNodeStatusUpdater { // temp fix until metrics system can auto-detect itself running in unit test: @@ -352,7 +353,6 @@ protected boolean isTokenKeepAliveEnabled(Configuration conf) { private class MyNodeStatusUpdater4 extends NodeStatusUpdaterImpl { - private Context context; private final long rmStartIntervalMS; private final boolean rmNeverStart; public ResourceTracker resourceTracker; @@ -360,7 +360,6 @@ public MyNodeStatusUpdater4(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, long rmStartIntervalMS, boolean rmNeverStart) { super(context, dispatcher, healthChecker, metrics); - this.context = context; this.rmStartIntervalMS = rmStartIntervalMS; this.rmNeverStart = rmNeverStart; } @@ -376,8 +375,8 @@ protected ResourceTracker getRMClient() throws IOException { RetryPolicy retryPolicy = RMProxy.createRetryPolicy(conf); resourceTracker = (ResourceTracker) RetryProxy.create(ResourceTracker.class, - new MyResourceTracker6(this.context, rmStartIntervalMS, - rmNeverStart), retryPolicy); + new MyResourceTracker6(rmStartIntervalMS, rmNeverStart), + retryPolicy); return resourceTracker; } @@ -685,14 +684,11 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) private class MyResourceTracker6 implements ResourceTracker { - private final Context context; private long rmStartIntervalMS; private boolean rmNeverStart; private final long waitStartTime; - public MyResourceTracker6(Context context, long rmStartIntervalMS, - boolean rmNeverStart) { - this.context = context; + public MyResourceTracker6(long rmStartIntervalMS, boolean rmNeverStart) { this.rmStartIntervalMS = rmStartIntervalMS; this.rmNeverStart = rmNeverStart; this.waitStartTime = System.currentTimeMillis(); @@ -868,8 +864,8 @@ protected ContainerManagerImpl createContainerManager(Context context, metrics, aclsManager, dirsHandler) { @Override - public void cleanUpApplications(NodeManagerEventType eventType) { - super.cleanUpApplications(NodeManagerEventType.SHUTDOWN); + public void cleanUpApplicationsOnNMShutDown() { + super.cleanUpApplicationsOnNMShutDown(); numCleanups.incrementAndGet(); } }; @@ -1222,8 +1218,8 @@ protected ContainerManagerImpl createContainerManager(Context context, metrics, aclsManager, dirsHandler) { @Override - public void cleanUpApplications(NodeManagerEventType eventType) { - super.cleanUpApplications(NodeManagerEventType.SHUTDOWN); + public void cleanUpApplicationsOnNMShutDown() { + super.cleanUpApplicationsOnNMShutDown(); numCleanups.incrementAndGet(); } };