From be50d221f5e7a7df13b62b672909ecbe2e2c027b Mon Sep 17 00:00:00 2001
From: cxzl25
Date: Wed, 10 May 2023 15:16:33 +0800
Subject: [PATCH] YARN-11467. RM failover may fail when the nodes.exclude-path file does not exist (#5565)

---
 .../resourcemanager/NodesListManager.java     |   8 +-
 .../yarn/server/resourcemanager/TestRMHA.java | 111 ++++++++++++++++++
 2 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
index afa85483d1..21be92169a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
@@ -220,7 +220,13 @@ private void printConfiguredHosts(boolean graceful) {
 
   public void refreshNodes(Configuration yarnConf)
       throws IOException, YarnException {
-    refreshNodes(yarnConf, false);
+    try {
+      refreshNodes(yarnConf, false);
+    } catch (YarnException | IOException ex) {
+      // YARN-11467: do not propagate the failure (e.g. the nodes.exclude-path
+      // file does not exist); hand it to disableHostsFileReader instead.
+      disableHostsFileReader(ex);
+    }
   }
 
   public void refreshNodes(Configuration yarnConf, boolean graceful)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java
index 83d81ec9eb..cdaea56fe4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java
@@ -18,6 +18,10 @@
 
 package org.apache.hadoop.yarn.server.resourcemanager;
 
+import java.io.DataOutputStream;
+import java.io.File;
+import java.nio.file.Files;
+import java.util.UUID;
 import java.util.function.Supplier;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
@@ -742,6 +746,113 @@ public void testResourceProfilesManagerAfterRMWentStandbyThenBackToActive()
         rm.getRMContext().getResourceProfilesManager());
   }
 
+  /**
+   * Verifies that forced HA transitions still succeed when
+   * {@link YarnConfiguration#RM_NODES_EXCLUDE_FILE_PATH} points at a file
+   * that does not exist (YARN-11467). The configuration is written to a
+   * temporary yarn-site.xml under target/test-classes, and any existing
+   * file is moved aside and restored afterwards.
+   */
+  @Test
+  public void testTransitionedToActiveWithExcludeFileNotExist() throws Exception {
+    final String errUnforcedRequest = "User request succeeded even when " +
+        "automatic failover is enabled";
+
+    Configuration conf = new YarnConfiguration(configuration);
+    String nodeExcludeFilePath = "/tmp/non-existent-path-" + UUID.randomUUID();
+    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, nodeExcludeFilePath);
+
+    final File confFile =
+        new File("target/test-classes/" + YarnConfiguration.YARN_SITE_CONFIGURATION_FILE);
+    final File backupConfFile = new File(
+        "target/test-classes/" + YarnConfiguration.YARN_SITE_CONFIGURATION_FILE
+            + ".backup." + UUID.randomUUID());
+    boolean hasRenamed = false;
+    boolean hasCreated = false;
+    try {
+      if (confFile.exists()) {
+        hasRenamed = confFile.renameTo(backupConfFile);
+        if (!hasRenamed) {
+          Assert.fail("Can not rename " + confFile.getAbsolutePath() + " to "
+              + backupConfFile.getAbsolutePath());
+        }
+      }
+      hasCreated = confFile.createNewFile();
+      if (!hasCreated) {
+        Assert.fail(
+            "Can not create " + YarnConfiguration.YARN_SITE_CONFIGURATION_FILE);
+      }
+      try (DataOutputStream output =
+          new DataOutputStream(Files.newOutputStream(confFile.toPath()))) {
+        conf.writeXml(output);
+      }
+
+      rm = new MockRM(conf);
+      rm.init(conf);
+      rm.start();
+      StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(
+          HAServiceProtocol.RequestSource.REQUEST_BY_USER);
+
+      // Transition to standby
+      try {
+        rm.adminService.transitionToStandby(requestInfo);
+        fail(errUnforcedRequest);
+      } catch (AccessControlException e) {
+        // expected
+      }
+      checkMonitorHealth();
+      checkStandbyRMFunctionality();
+
+      // Transition to active
+      try {
+        rm.adminService.transitionToActive(requestInfo);
+        fail(errUnforcedRequest);
+      } catch (AccessControlException e) {
+        // expected
+      }
+      checkMonitorHealth();
+      checkStandbyRMFunctionality();
+
+      final String errForcedRequest =
+          "Forced request by user should work " + "even if automatic failover is enabled";
+      requestInfo = new StateChangeRequestInfo(
+          HAServiceProtocol.RequestSource.REQUEST_BY_USER_FORCED);
+
+      // Transition to standby
+      try {
+        rm.adminService.transitionToStandby(requestInfo);
+      } catch (AccessControlException e) {
+        fail(errForcedRequest);
+      }
+      checkMonitorHealth();
+      checkStandbyRMFunctionality();
+
+      // Transition to active
+      try {
+        rm.adminService.transitionToActive(requestInfo);
+      } catch (AccessControlException e) {
+        fail(errForcedRequest);
+      }
+      checkMonitorHealth();
+      checkActiveRMFunctionality();
+    } finally {
+      // Restore the classpath yarn-site.xml even when setup or the test body
+      // failed, and still stop the RM if restoring the file throws.
+      try {
+        if (hasCreated) {
+          Files.deleteIfExists(confFile.toPath());
+        }
+        if (hasRenamed) {
+          Files.move(backupConfFile.toPath(), confFile.toPath());
+        }
+      } finally {
+        if (rm != null) {
+          rm.stop();
+        }
+      }
+    }
+  }
+
   public void innerTestHAWithRMHostName(boolean includeBindHost) {
     //this is run two times, with and without a bind host configured
     if (includeBindHost) {