YARN-4243. Add retry on establishing Zookeeper connection in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong.

This commit is contained in:
Junping Du 2015-10-22 13:41:09 -07:00
parent 960201b79b
commit 0fce5f9a49
5 changed files with 68 additions and 8 deletions

View File

@ -208,8 +208,49 @@ static enum State {
*/ */
public ActiveStandbyElector(String zookeeperHostPorts, public ActiveStandbyElector(String zookeeperHostPorts,
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl, int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
List<ZKAuthInfo> authInfo, List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException, int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
KeeperException {
this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
authInfo, app, maxRetryNum, true);
}
/**
* Create a new ActiveStandbyElector object <br/>
* The elector is created by providing to it the Zookeeper configuration, the
* parent znode under which to create the znode and a reference to the
* callback interface. <br/>
* The parent znode name must be the same for all service instances and
* different across services. <br/>
* After the leader has been lost, a new leader will be elected after the
* session timeout expires. Hence, the app must set this parameter based on
* its needs for failure response time. The session timeout must be greater
* than the Zookeeper disconnect timeout and is recommended to be 3X that
* value to enable Zookeeper to retry transient disconnections. Setting a very
* short session timeout may result in frequent transitions between active and
* standby states during issues like network outages/GC pauses.
*
* @param zookeeperHostPorts
* ZooKeeper hostPort for all ZooKeeper servers
* @param zookeeperSessionTimeout
* ZooKeeper session timeout
* @param parentZnodeName
* znode under which to create the lock
* @param acl
* ZooKeeper ACL's
* @param authInfo a list of authentication credentials to add to the
* ZK connection
* @param app
* reference to callback interface object
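* @param maxRetryNum
* number of times ZooKeeper operations are retried
* @param failFast
* if true, establish the ZK connection with a single createConnection()
* call and fail immediately on error; if false, establish it through
* reEstablishSession() so that connection attempts are retried.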
* @throws IOException
* @throws HadoopIllegalArgumentException
*/
public ActiveStandbyElector(String zookeeperHostPorts,
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
int maxRetryNum, boolean failFast) throws IOException,
HadoopIllegalArgumentException, KeeperException { HadoopIllegalArgumentException, KeeperException {
if (app == null || acl == null || parentZnodeName == null if (app == null || acl == null || parentZnodeName == null
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
@ -225,8 +266,12 @@ public ActiveStandbyElector(String zookeeperHostPorts,
zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
this.maxRetryNum = maxRetryNum; this.maxRetryNum = maxRetryNum;
// createConnection for future API calls // establish the ZK Connection for future API calls
if (failFast) {
createConnection(); createConnection();
} else {
reEstablishSession();
}
} }
/** /**

View File

@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED
YARN-3985. Make ReservationSystem persist state using RMStateStore YARN-3985. Make ReservationSystem persist state using RMStateStore
reservation APIs. (adhoot via asuresh) reservation APIs. (adhoot via asuresh)
YARN-4243. Add retry on establishing Zookeeper connection in
EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)
OPTIMIZATIONS OPTIMIZATIONS
YARN-3339. TestDockerContainerExecutor should pull a single image and not YARN-3339. TestDockerContainerExecutor should pull a single image and not

View File

@ -531,6 +531,10 @@ private static void addDeprecatedKeys() {
public static final int public static final int
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0; DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
/** Number of times ZooKeeper operations are retried in ActiveStandbyElector. */
public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
+ "failover-controller.active-standby-elector.zk.retries";
//////////////////////////////// ////////////////////////////////
// RM state store configs // RM state store configs
//////////////////////////////// ////////////////////////////////

View File

@ -388,6 +388,13 @@
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value> <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
</property> </property>
<property>
<description>When automatic failover is enabled, the number of times
ZooKeeper operations are retried in ActiveStandbyElector.</description>
<name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
<!--<value>3</value>-->
</property>
<property> <property>
<description>The maximum number of completed applications RM state <description>The maximum number of completed applications RM state
store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}. store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.

View File

@ -86,11 +86,12 @@ protected void serviceInit(Configuration conf)
List<ACL> zkAcls = RMZKUtils.getZKAcls(conf); List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf); List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
int maxRetryNum = conf.getInt( int maxRetryNum =
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout, elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
electionZNode, zkAcls, zkAuths, this, maxRetryNum); electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);
elector.ensureParentZNode(); elector.ensureParentZNode();
if (!isParentZnodeSafe(clusterId)) { if (!isParentZnodeSafe(clusterId)) {