diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java index fcbcfdf1b8..cb2e08143e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -208,8 +208,49 @@ static enum State { */ public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName, List acl, - List authInfo, - ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException, + List authInfo, ActiveStandbyElectorCallback app, + int maxRetryNum) throws IOException, HadoopIllegalArgumentException, + KeeperException { + this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl, + authInfo, app, maxRetryNum, true); + } + + /** + * Create a new ActiveStandbyElector object
+ * The elector is created by providing to it the Zookeeper configuration, the + * parent znode under which to create the znode and a reference to the + * callback interface.
+ * The parent znode name must be the same for all service instances and + * different across services.
+ * After the leader has been lost, a new leader will be elected after the + * session timeout expires. Hence, the app must set this parameter based on + * its needs for failure response time. The session timeout must be greater + * than the Zookeeper disconnect timeout and is recommended to be 3X that + * value to enable Zookeeper to retry transient disconnections. Setting a very + * short session timeout may result in frequent transitions between active and + * standby states during issues like network outages/GC pauses. + * + * @param zookeeperHostPorts + * ZooKeeper hostPort for all ZooKeeper servers + * @param zookeeperSessionTimeout + * ZooKeeper session timeout + * @param parentZnodeName + * znode under which to create the lock + * @param acl + * ZooKeeper ACLs + * @param authInfo a list of authentication credentials to add to the + * ZK connection + * @param app + * reference to callback interface object + * @param failFast + * whether to skip the retry when establishing the ZK connection. 
+ * @throws IOException + * @throws HadoopIllegalArgumentException + */ + public ActiveStandbyElector(String zookeeperHostPorts, + int zookeeperSessionTimeout, String parentZnodeName, List acl, + List authInfo, ActiveStandbyElectorCallback app, + int maxRetryNum, boolean failFast) throws IOException, HadoopIllegalArgumentException, KeeperException { if (app == null || acl == null || parentZnodeName == null || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { @@ -225,8 +266,12 @@ public ActiveStandbyElector(String zookeeperHostPorts, zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; this.maxRetryNum = maxRetryNum; - // createConnection for future API calls - createConnection(); + // establish the ZK Connection for future API calls + if (failFast) { + createConnection(); + } else { + reEstablishSession(); + } } /** diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 024255cd90..9f35307ca2 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED YARN-3985. Make ReservationSystem persist state using RMStateStore reservation APIs. (adhoot via asuresh) + YARN-4243. Add retry on establishing Zookeeper connection in + EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du) + OPTIMIZATIONS YARN-3339. 
TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 3e89259ffc..913b5dfd86 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -531,6 +531,10 @@ private static void addDeprecatedKeys() { public static final int DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0; + /** number of zookeeper operation retry times in ActiveStandbyElector */ + public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX + + "failover-controller.active-standby-elector.zk.retries"; + //////////////////////////////// // RM state store configs //////////////////////////////// diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 5dc45902fe..c6ffe18be6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -388,6 +388,13 @@ org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore + + When automatic failover is enabled, number of zookeeper + operation retry times in ActiveStandbyElector + yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries + + + The maximum number of completed applications RM state store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}. 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java index 73bdca0206..72327e82e9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java @@ -86,11 +86,12 @@ protected void serviceInit(Configuration conf) List zkAcls = RMZKUtils.getZKAcls(conf); List zkAuths = RMZKUtils.getZKAuths(conf); - int maxRetryNum = conf.getInt( - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + int maxRetryNum = + conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf + .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)); elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout, - electionZNode, zkAcls, zkAuths, this, maxRetryNum); + electionZNode, zkAcls, zkAuths, this, maxRetryNum, false); elector.ensureParentZNode(); if (!isParentZnodeSafe(clusterId)) {