YARN-10212. Create separate configuration for max global AM attempts. Contributed by Bilwa S T

(cherry picked from commit 57659422abbf6d9bf52e6e27fca775254bb77a56)
This commit is contained in:
Jonathan Hung 2020-04-09 10:23:05 -07:00
parent e4331a73c9
commit e1dd78143b
6 changed files with 70 additions and 18 deletions

View File

@ -502,12 +502,19 @@ public static boolean isAclEnabled(Configuration conf) {
public static final int DEFAULT_RM_ADMIN_CLIENT_THREAD_COUNT = 1;
/**
* The maximum number of application attempts.
* It's a global setting for all application masters.
* The default maximum number of application attempts for
* an application, used when the user does not set one.
*/
public static final String RM_AM_MAX_ATTEMPTS =
RM_PREFIX + "am.max-attempts";
public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2;
/**
* The maximum number of application attempts.
* It's a global setting for all application masters.
*/
public static final String GLOBAL_RM_AM_MAX_ATTEMPTS =
RM_PREFIX + "am.global.max-attempts";
/** The keytab for the resource manager.*/
public static final String RM_KEYTAB =

View File

@ -327,11 +327,10 @@
</property>
<property>
<description>The maximum number of application attempts. It's a global
setting for all application masters. Each application master can specify
its individual maximum number of application attempts via the API, but the
individual number cannot be more than the global upper bound. If it is,
the resourcemanager will override it. The default number is set to 2, to
<description>The default maximum number of application attempts, if unset by
the user. Each application master can specify its individual maximum number of application
attempts via the API, but the individual number cannot be more than the global upper bound in
yarn.resourcemanager.am.global.max-attempts. The default number is set to 2, to
allow at least one retry for AM.</description>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>2</value>
@ -4534,4 +4533,18 @@
<name>yarn.webapp.enable-rest-app-submissions</name>
<value>true</value>
</property>
<property>
<description>
The maximum number of application attempts. It's a global
setting for all application masters. Each application master can specify
its individual maximum number of application attempts via the API, but the
individual number cannot be more than the global upper bound. If it is,
the resourcemanager will override it. If this property is unset, it
defaults to the value of yarn.resourcemanager.am.max-attempts.
</description>
<name>yarn.resourcemanager.am.global.max-attempts</name>
<value></value>
</property>
</configuration>

View File

@ -613,12 +613,20 @@ protected SystemMetricsPublisher createSystemMetricsPublisher() {
// sanity check for configurations
protected static void validateConfigs(Configuration conf) {
// validate max-attempts
int globalMaxAppAttempts =
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
if (rmMaxAppAttempts <= 0) {
throw new YarnRuntimeException("Invalid rm am max attempts configuration"
+ ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS
+ "=" + rmMaxAppAttempts + ", it should be a positive integer.");
}
int globalMaxAppAttempts = conf.getInt(
YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
if (globalMaxAppAttempts <= 0) {
throw new YarnRuntimeException("Invalid global max attempts configuration"
+ ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS
+ ", " + YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS
+ "=" + globalMaxAppAttempts + ", it should be a positive integer.");
}

View File

@ -453,11 +453,20 @@ public RMAppImpl(ApplicationId applicationId, RMContext rmContext,
this.applicationPriority = Priority.newInstance(0);
}
int globalMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
int globalMaxAppAttempts = conf.getInt(
YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
int individualMaxAppAttempts = submissionContext.getMaxAppAttempts();
if (individualMaxAppAttempts <= 0 ||
individualMaxAppAttempts > globalMaxAppAttempts) {
if (individualMaxAppAttempts <= 0) {
this.maxAppAttempts = rmMaxAppAttempts;
LOG.warn("The specific max attempts: " + individualMaxAppAttempts
+ " for application: " + applicationId.getId()
+ " is invalid, because it is less than or equal to zero."
+ " Use the rm max attempts instead.");
} else if (individualMaxAppAttempts > globalMaxAppAttempts) {
this.maxAppAttempts = globalMaxAppAttempts;
LOG.warn("The specific max attempts: " + individualMaxAppAttempts
+ " for application: " + applicationId.getId()
@ -1211,8 +1220,9 @@ private String getAppAttemptFailedDiagnostics(RMAppEvent event) {
+ " failed due to " + failedEvent.getDiagnosticMsg()
+ ". Failing the application.";
} else if (this.isNumAttemptsBeyondThreshold) {
int globalLimit = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
int globalLimit = conf.getInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
msg = String.format(
"Application %s failed %d times%s%s due to %s. Failing the application.",
getApplicationId(),

View File

@ -980,17 +980,20 @@ public void testRMAppSubmitWithInvalidTokens() throws Exception {
@Test (timeout = 30000)
public void testRMAppSubmitMaxAppAttempts() throws Exception {
int[] globalMaxAppAttempts = new int[] { 10, 1 };
int[] rmAmMaxAttempts = new int[] { 8, 1 };
int[][] individualMaxAppAttempts = new int[][]{
new int[]{ 9, 10, 11, 0 },
new int[]{ 1, 10, 0, -1 }};
int[][] expectedNums = new int[][]{
new int[]{ 9, 10, 10, 10 },
new int[]{ 9, 10, 10, 8 },
new int[]{ 1, 1, 1, 1 }};
for (int i = 0; i < globalMaxAppAttempts.length; ++i) {
for (int j = 0; j < individualMaxAppAttempts.length; ++j) {
ResourceScheduler scheduler = mockResourceScheduler();
Configuration conf = new Configuration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, globalMaxAppAttempts[i]);
conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
globalMaxAppAttempts[i]);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, rmAmMaxAttempts[i]);
ApplicationMasterService masterService =
new ApplicationMasterService(rmContext, scheduler);
TestRMAppManager appMonitor = new TestRMAppManager(rmContext,

View File

@ -237,7 +237,7 @@ private void checkResourceUsage(
@Test (timeout = 30000)
public void testResourceManagerInitConfigValidation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1);
conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, -1);
try {
resourceManager = new MockRM(conf);
fail("Exception is expected because the global max attempts" +
@ -247,6 +247,17 @@ public void testResourceManagerInitConfigValidation() throws Exception {
if (!e.getMessage().startsWith(
"Invalid global max attempts configuration")) throw e;
}
Configuration yarnConf = new YarnConfiguration();
yarnConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1);
try {
resourceManager = new MockRM(yarnConf);
fail("Exception is expected because AM max attempts" +
" is negative.");
} catch (YarnRuntimeException e) {
// Exception is expected.
if (!e.getMessage().startsWith(
"Invalid rm am max attempts configuration")) throw e;
}
}
@Test