From 3fa0d540dfca579f3c2840a959b748a7528b02ed Mon Sep 17 00:00:00 2001 From: Sunil G Date: Mon, 23 Jan 2017 12:19:35 +0530 Subject: [PATCH] YARN-6031. Application recovery has failed when node label feature is turned off during RM recovery. Contributed by Ying Zhang. --- .../server/resourcemanager/RMAppManager.java | 54 ++++++++++++++- .../server/resourcemanager/TestRMRestart.java | 69 +++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java index 47eec54928..8c6ba09317 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException; +import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.security.AccessRequest; @@ -63,6 +64,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import 
org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; @@ -336,6 +338,34 @@ protected void recoverApplication(ApplicationStateData appState, createAndPopulateNewRMApp(appContext, appState.getSubmitTime(), appState.getUser(), true, appState.getStartTime()); + // If null amReq has been returned, check if it is the case that + // application has specified node label expression while node label + // has been disabled. Reject the recovery of this application if it + // is true and give clear message so that user can react properly. + if (!appContext.getUnmanagedAM() && + application.getAMResourceRequest() == null && + !YarnConfiguration.areNodeLabelsEnabled(this.conf)) { + // check application submission context and see if am resource request + // or application itself contains any node label expression. + ResourceRequest amReqFromAppContext = + appContext.getAMContainerResourceRequest(); + String labelExp = (amReqFromAppContext != null) ? + amReqFromAppContext.getNodeLabelExpression() : null; + if (labelExp == null) { + labelExp = appContext.getNodeLabelExpression(); + } + if (labelExp != null && + !labelExp.equals(RMNodeLabelsManager.NO_LABEL)) { + String message = "Failed to recover application " + appId + + ". 
NodeLabel is not enabled in cluster, but AM resource request " + + "contains a label expression."; + LOG.warn(message); + application.handle( + new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message)); + return; + } + } + application.handle(new RMAppRecoverEvent(appId, rmState)); } @@ -355,8 +385,28 @@ private RMAppImpl createAndPopulateNewRMApp( } ApplicationId applicationId = submissionContext.getApplicationId(); - ResourceRequest amReq = - validateAndCreateResourceRequest(submissionContext, isRecovery); + ResourceRequest amReq = null; + try { + amReq = validateAndCreateResourceRequest(submissionContext, isRecovery); + } catch (InvalidLabelResourceRequestException e) { + // This can happen if the application had been submitted and run + // with Node Label enabled but is recovered with Node Label disabled. + // Thus there might be a node label expression in the application's + // resource requests. If this is the case, create RMAppImpl with + // null amReq and reject the application later with a clear error + // message, so that the application can still be tracked by RM + // after recovery and user can see what's going on and react accordingly. + if (isRecovery && + !YarnConfiguration.areNodeLabelsEnabled(this.conf)) { + if (LOG.isDebugEnabled()) { + LOG.debug("AMResourceRequest is not created for " + applicationId + + ". 
NodeLabel is not enabled in cluster, but AM resource " + + "request contains a label expression."); + } + } else { + throw e; + } + } // Verify and get the update application priority and set back to // submissionContext diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index 9d4b639df5..f0b093abf8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -110,6 +110,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.timelineservice.collector.TimelineCollectorContext; @@ -2539,4 +2540,72 @@ private MockAM launchAndFailAM(RMApp app, MockRM rm, MockNM nm) rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FAILED); return am; } + + @Test(timeout = 60000) + public void testRMRestartAfterNodeLabelDisabled() throws Exception { + MemoryRMStateStore memStore = new MemoryRMStateStore(); + memStore.init(conf); + + conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true); + + MockRM rm1 = new 
MockRM( + TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) { + @Override + protected RMNodeLabelsManager createNodeLabelManager() { + RMNodeLabelsManager mgr = new RMNodeLabelsManager(); + mgr.init(getConfig()); + return mgr; + } + }; + rm1.start(); + + // add node label "x" to the cluster and map node h1 to it + Set clusterNodeLabels = new HashSet(); + clusterNodeLabels.add("x"); + RMNodeLabelsManager nodeLabelManager = + rm1.getRMContext().getNodeLabelManager(); + nodeLabelManager. + addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels); + nodeLabelManager.addLabelsToNode( + ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x"))); + MockNM nm1 = rm1.registerNode("h1:1234", 8000); // label = x + + // submit an application specifying "x" as the AM node label expression + RMApp app1 = rm1.submitApp(200, "someApp", "someUser", null, "a1", "x"); + // check that the AM container was allocated with node label expression "x" + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); + ContainerId amContainerId1 = + ContainerId.newContainerId(am1.getApplicationAttemptId(), 1); + Assert.assertEquals("x", rm1.getRMContext().getScheduler(). + getRMContainer(amContainerId1).getNodeLabelExpression()); + finishApplicationMaster(app1, rm1, nm1, am1); + + // restart RM with node labels disabled + conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, false); + MockRM rm2 = new MockRM( + TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) { + @Override + protected RMNodeLabelsManager createNodeLabelManager() { + RMNodeLabelsManager mgr = new RMNodeLabelsManager(); + mgr.init(getConfig()); + return mgr; + } + }; + + // rm2 should start successfully, with app1 loaded back in FAILED state + // because node labels are disabled while app1's AM resource request + // contains a node label expression. 
+ try { + rm2.start(); + Assert.assertTrue("RM start successfully", true); + Assert.assertEquals(1, rm2.getRMContext().getRMApps().size()); + rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED); + } catch (Exception e) { + LOG.debug("Exception on start", e); + Assert.fail("RM should start without any issue"); + } finally { + rm1.stop(); + rm2.stop(); + } + } }