From 86c250a54a586b1db098121c0c052cb3580fe5a4 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Tue, 30 May 2023 01:37:08 +0800 Subject: [PATCH] YARN-7720. Race condition between second app attempt and UAM timeout when first attempt node is down. (#5672) --- .../hadoop/yarn/conf/YarnConfiguration.java | 3 ++- .../yarn/util/AbstractLivelinessMonitor.java | 2 +- .../src/main/resources/yarn-default.xml | 7 +++-- .../yarn/conf/TestYarnConfiguration.java | 19 +++++++++++++ .../AMRMProxyTokenSecretManager.java | 20 +++++++++----- .../resourcemanager/ResourceManager.java | 27 +++++++++++++++++++ .../rmapp/attempt/AMLivelinessMonitor.java | 14 ++++++++-- .../security/AMRMTokenSecretManager.java | 19 +++++++++---- 8 files changed, 94 insertions(+), 17 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 123a6de6a4..b6601b835d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -394,7 +394,8 @@ private static void addDeprecatedKeys() { /** The expiry interval for application master reporting.*/ public static final String RM_AM_EXPIRY_INTERVAL_MS = YARN_PREFIX + "am.liveness-monitor.expiry-interval-ms"; - public static final int DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = 600000; + public static final long DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = + TimeUnit.MINUTES.toMillis(15); /** How long to wait until a node manager is considered dead.*/ public static final String RM_NM_EXPIRY_INTERVAL_MS = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java index 3a9c0e8d8f..0ae6c47d0e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java @@ -83,7 +83,7 @@ protected void serviceStop() throws Exception { protected abstract void expire(O ob); - protected void setExpireInterval(int expireInterval) { + protected void setExpireInterval(long expireInterval) { this.expireInterval = expireInterval; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index d89f048ed3..1f0982aede 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -169,9 +169,12 @@ - The expiry interval for application master reporting. + + The expiry interval for application master reporting. + The default is 900000 ms, or 15m. + yarn.am.liveness-monitor.expiry-interval-ms - 600000 + 900000 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java index e4547a9163..3b76d41af7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java @@ -19,7 +19,9 @@ package org.apache.hadoop.yarn.conf; import java.net.InetSocketAddress; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.math.NumberUtils; import org.junit.jupiter.api.Test; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; @@ -247,4 +249,21 @@ void testUpdateConnectAddr() throws Exception { assertNull(conf.get( HAUtil.addSuffix(YarnConfiguration.NM_LOCALIZER_ADDRESS, "rm1"))); } + + @Test + void checkRmAmExpiryIntervalSetting() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + + // 30m, 1800000ms + conf.set(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, "30m"); + long rmAmExpiryIntervalMS = conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS); + assertEquals(1800000, rmAmExpiryIntervalMS); + + // 10m, 600000ms + conf.set(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, "600000"); + String rmAmExpiryIntervalMS1 = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + assertTrue(NumberUtils.isDigits(rmAmExpiryIntervalMS1)); + assertEquals(600000, Long.parseLong(rmAmExpiryIntervalMS1)); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java index d1468c4111..ea4e979401 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java @@ -24,9 +24,12 @@ import java.util.Set; import java.util.Timer; import java.util.TimerTask; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.commons.lang3.math.NumberUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -88,12 +91,17 @@ public void init(Configuration conf) { YarnConfiguration.DEFAULT_RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS) * 1000; // Adding delay = 1.5 * expiry interval makes sure that all active AMs get // the updated shared-key. - this.activationDelay = - (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, - YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); - LOG.info("AMRMTokenKeyRollingInterval: " + this.rollingInterval - + "ms and AMRMTokenKeyActivationDelay: " + this.activationDelay - + " ms"); + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + this.activationDelay = (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); + } else { + this.activationDelay = (long) (conf.getTimeDuration( + YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS) * 1.5); + } + LOG.info("AMRMTokenKeyRollingInterval: {} ms and AMRMTokenKeyActivationDelay: {} ms.", + this.rollingInterval, this.activationDelay); if (rollingInterval <= activationDelay * 2) { throw new IllegalArgumentException( YarnConfiguration.RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index da500add4c..52aa466b3f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.classification.VisibleForTesting; import com.sun.jersey.spi.container.servlet.ServletContainer; @@ -703,6 +704,32 @@ protected static void validateConfigs(Configuration conf) { + ", " + YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS + "=" + heartbeatIntvl); } + + if (HAUtil.isFederationEnabled(conf)) { + /* + * In Yarn Federation, we need UAMs in secondary sub-clusters to stay + * alive when the next attempt AM in home sub-cluster gets launched. If + * the previous AM died because the node is lost after NM timeout. It will + * already be too late if AM timeout is even shorter. + */ + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + long amExpireIntvl; + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + amExpireIntvl = conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS); + } else { + amExpireIntvl = conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS); + } + + if (amExpireIntvl <= expireIntvl) { + throw new YarnRuntimeException("When Yarn Federation is enabled, " + + "AM expiry interval should be no less than NM expiry interval, " + + YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS + "=" + amExpireIntvl + + ", " + YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS + "=" + + expireIntvl); + } + } } /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java index 7006e500b9..d85b044b84 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -27,6 +28,8 @@ import org.apache.hadoop.yarn.util.AbstractLivelinessMonitor; import org.apache.hadoop.yarn.util.Clock; +import java.util.concurrent.TimeUnit; + public class AMLivelinessMonitor extends AbstractLivelinessMonitor { private EventHandler dispatcher; @@ -43,8 +46,15 @@ public AMLivelinessMonitor(Dispatcher d, Clock clock) { public void serviceInit(Configuration conf) throws Exception { super.serviceInit(conf); - int expireIntvl = conf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, - YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS); + long expireIntvl; + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + expireIntvl = conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS); + } else { + expireIntvl = conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS); + } setExpireInterval(expireIntvl); setMonitorInterval(expireIntvl/3); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/security/AMRMTokenSecretManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/security/AMRMTokenSecretManager.java index 59fdc57234..337b585177 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/security/AMRMTokenSecretManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/security/AMRMTokenSecretManager.java @@ -24,10 +24,12 @@ import java.util.Set; import java.util.Timer; import java.util.TimerTask; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import org.apache.commons.lang3.math.NumberUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -90,11 +92,18 @@ public AMRMTokenSecretManager(Configuration conf, RMContext rmContext) { YarnConfiguration.DEFAULT_RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS) * 1000; // Adding delay = 1.5 * expiry interval makes sure that all active AMs get // the updated shared-key. - this.activationDelay = - (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, - YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); - LOG.info("AMRMTokenKeyRollingInterval: " + this.rollingInterval - + "ms and AMRMTokenKeyActivationDelay: " + this.activationDelay + " ms"); + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + this.activationDelay = (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); + } else { + this.activationDelay = + (long) (conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS) * 1.5); + } + + LOG.info("AMRMTokenKeyRollingInterval: {} ms and AMRMTokenKeyActivationDelay: {} ms", + this.rollingInterval, this.activationDelay); if (rollingInterval <= activationDelay * 2) { throw new IllegalArgumentException( YarnConfiguration.RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS