YARN-4243. Add retry on establishing Zookeeper connection in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong.
This commit is contained in:
parent
960201b79b
commit
0fce5f9a49
@ -208,8 +208,49 @@ static enum State {
|
||||
*/
|
||||
public ActiveStandbyElector(String zookeeperHostPorts,
|
||||
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
|
||||
List<ZKAuthInfo> authInfo,
|
||||
ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
|
||||
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
|
||||
int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
|
||||
KeeperException {
|
||||
this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
|
||||
authInfo, app, maxRetryNum, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ActiveStandbyElector object <br/>
|
||||
* The elector is created by providing to it the Zookeeper configuration, the
|
||||
* parent znode under which to create the znode and a reference to the
|
||||
* callback interface. <br/>
|
||||
* The parent znode name must be the same for all service instances and
|
||||
* different across services. <br/>
|
||||
* After the leader has been lost, a new leader will be elected after the
|
||||
* session timeout expires. Hence, the app must set this parameter based on
|
||||
* its needs for failure response time. The session timeout must be greater
|
||||
* than the Zookeeper disconnect timeout and is recommended to be 3X that
|
||||
* value to enable Zookeeper to retry transient disconnections. Setting a very
|
||||
* short session timeout may result in frequent transitions between active and
|
||||
* standby states during issues like network outages/GC pauses.
|
||||
*
|
||||
* @param zookeeperHostPorts
|
||||
* ZooKeeper hostPort for all ZooKeeper servers
|
||||
* @param zookeeperSessionTimeout
|
||||
* ZooKeeper session timeout
|
||||
* @param parentZnodeName
|
||||
* znode under which to create the lock
|
||||
* @param acl
|
||||
* ZooKeeper ACL's
|
||||
* @param authInfo a list of authentication credentials to add to the
|
||||
* ZK connection
|
||||
* @param app
|
||||
* reference to callback interface object
|
||||
* @param failFast
|
||||
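* if true, establish the ZK connection with a single createConnection() call; if false, establish it via reEstablishSession() so that connection attempts are retried.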
|
||||
* @throws IOException
|
||||
* @throws HadoopIllegalArgumentException
|
||||
*/
|
||||
public ActiveStandbyElector(String zookeeperHostPorts,
|
||||
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
|
||||
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
|
||||
int maxRetryNum, boolean failFast) throws IOException,
|
||||
HadoopIllegalArgumentException, KeeperException {
|
||||
if (app == null || acl == null || parentZnodeName == null
|
||||
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
|
||||
@ -225,8 +266,12 @@ public ActiveStandbyElector(String zookeeperHostPorts,
|
||||
zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
|
||||
this.maxRetryNum = maxRetryNum;
|
||||
|
||||
// createConnection for future API calls
|
||||
// establish the ZK Connection for future API calls
|
||||
if (failFast) {
|
||||
createConnection();
|
||||
} else {
|
||||
reEstablishSession();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED
|
||||
YARN-3985. Make ReservationSystem persist state using RMStateStore
|
||||
reservation APIs. (adhoot via asuresh)
|
||||
|
||||
YARN-4243. Add retry on establishing Zookeeper connection in
|
||||
EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||
|
@ -531,6 +531,10 @@ private static void addDeprecatedKeys() {
|
||||
public static final int
|
||||
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
|
||||
|
||||
/** number of zookeeper operation retry times in ActiveStandbyElector */
|
||||
public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
|
||||
+ "failover-controller.active-standby-elector.zk.retries";
|
||||
|
||||
////////////////////////////////
|
||||
// RM state store configs
|
||||
////////////////////////////////
|
||||
|
@ -388,6 +388,13 @@
|
||||
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>When automatic failover is enabled, number of zookeeper
|
||||
operation retry times in ActiveStandbyElector</description>
|
||||
<name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
|
||||
<!--<value>3</value>-->
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>The maximum number of completed applications RM state
|
||||
store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.
|
||||
|
@ -86,11 +86,12 @@ protected void serviceInit(Configuration conf)
|
||||
List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
|
||||
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
|
||||
|
||||
int maxRetryNum = conf.getInt(
|
||||
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
|
||||
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
|
||||
int maxRetryNum =
|
||||
conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
|
||||
.getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
|
||||
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
|
||||
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
|
||||
electionZNode, zkAcls, zkAuths, this, maxRetryNum);
|
||||
electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);
|
||||
|
||||
elector.ensureParentZNode();
|
||||
if (!isParentZnodeSafe(clusterId)) {
|
||||
|
Loading…
Reference in New Issue
Block a user