YARN-4243. Add retry on establishing Zookeeper connection in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong.
This commit is contained in:
parent
960201b79b
commit
0fce5f9a49
@ -208,8 +208,49 @@ static enum State {
|
|||||||
*/
|
*/
|
||||||
public ActiveStandbyElector(String zookeeperHostPorts,
|
public ActiveStandbyElector(String zookeeperHostPorts,
|
||||||
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
|
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
|
||||||
List<ZKAuthInfo> authInfo,
|
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
|
||||||
ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
|
int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
|
||||||
|
KeeperException {
|
||||||
|
this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
|
||||||
|
authInfo, app, maxRetryNum, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ActiveStandbyElector object <br/>
|
||||||
|
* The elector is created by providing to it the Zookeeper configuration, the
|
||||||
|
* parent znode under which to create the znode and a reference to the
|
||||||
|
* callback interface. <br/>
|
||||||
|
* The parent znode name must be the same for all service instances and
|
||||||
|
* different across services. <br/>
|
||||||
|
* After the leader has been lost, a new leader will be elected after the
|
||||||
|
* session timeout expires. Hence, the app must set this parameter based on
|
||||||
|
* its needs for failure response time. The session timeout must be greater
|
||||||
|
* than the Zookeeper disconnect timeout and is recommended to be 3X that
|
||||||
|
* value to enable Zookeeper to retry transient disconnections. Setting a very
|
||||||
|
* short session timeout may result in frequent transitions between active and
|
||||||
|
* standby states during issues like network outages/GC pauses.
|
||||||
|
*
|
||||||
|
* @param zookeeperHostPorts
|
||||||
|
* ZooKeeper hostPort for all ZooKeeper servers
|
||||||
|
* @param zookeeperSessionTimeout
|
||||||
|
* ZooKeeper session timeout
|
||||||
|
* @param parentZnodeName
|
||||||
|
* znode under which to create the lock
|
||||||
|
* @param acl
|
||||||
|
* ZooKeeper ACL's
|
||||||
|
* @param authInfo a list of authentication credentials to add to the
|
||||||
|
* ZK connection
|
||||||
|
* @param app
|
||||||
|
* reference to callback interface object
|
||||||
|
* @param failFast
|
||||||
|
* if true, establish the ZK connection with a single createConnection() call; if false, use reEstablishSession() so that establishing the connection is retried.
|
||||||
|
* @throws IOException
|
||||||
|
* @throws HadoopIllegalArgumentException
|
||||||
|
*/
|
||||||
|
public ActiveStandbyElector(String zookeeperHostPorts,
|
||||||
|
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
|
||||||
|
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
|
||||||
|
int maxRetryNum, boolean failFast) throws IOException,
|
||||||
HadoopIllegalArgumentException, KeeperException {
|
HadoopIllegalArgumentException, KeeperException {
|
||||||
if (app == null || acl == null || parentZnodeName == null
|
if (app == null || acl == null || parentZnodeName == null
|
||||||
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
|
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
|
||||||
@ -225,8 +266,12 @@ public ActiveStandbyElector(String zookeeperHostPorts,
|
|||||||
zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
|
zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
|
||||||
this.maxRetryNum = maxRetryNum;
|
this.maxRetryNum = maxRetryNum;
|
||||||
|
|
||||||
// createConnection for future API calls
|
// establish the ZK Connection for future API calls
|
||||||
|
if (failFast) {
|
||||||
createConnection();
|
createConnection();
|
||||||
|
} else {
|
||||||
|
reEstablishSession();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED
|
|||||||
YARN-3985. Make ReservationSystem persist state using RMStateStore
|
YARN-3985. Make ReservationSystem persist state using RMStateStore
|
||||||
reservation APIs. (adhoot via asuresh)
|
reservation APIs. (adhoot via asuresh)
|
||||||
|
|
||||||
|
YARN-4243. Add retry on establishing Zookeeper connection in
|
||||||
|
EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||||
|
@ -531,6 +531,10 @@ private static void addDeprecatedKeys() {
|
|||||||
public static final int
|
public static final int
|
||||||
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
|
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
|
||||||
|
|
||||||
|
/** number of zookeeper operation retry times in ActiveStandbyElector */
|
||||||
|
public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
|
||||||
|
+ "failover-controller.active-standby-elector.zk.retries";
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// RM state store configs
|
// RM state store configs
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
|
@ -388,6 +388,13 @@
|
|||||||
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
|
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>When automatic failover is enabled, number of zookeeper
|
||||||
|
operation retry times in ActiveStandbyElector</description>
|
||||||
|
<name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
|
||||||
|
<!--<value>3</value>-->
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>The maximum number of completed applications RM state
|
<description>The maximum number of completed applications RM state
|
||||||
store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.
|
store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.
|
||||||
|
@ -86,11 +86,12 @@ protected void serviceInit(Configuration conf)
|
|||||||
List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
|
List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
|
||||||
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
|
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
|
||||||
|
|
||||||
int maxRetryNum = conf.getInt(
|
int maxRetryNum =
|
||||||
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
|
conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
|
||||||
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
|
.getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
|
||||||
|
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
|
||||||
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
|
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
|
||||||
electionZNode, zkAcls, zkAuths, this, maxRetryNum);
|
electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);
|
||||||
|
|
||||||
elector.ensureParentZNode();
|
elector.ensureParentZNode();
|
||||||
if (!isParentZnodeSafe(clusterId)) {
|
if (!isParentZnodeSafe(clusterId)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user