diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index fff1df6249..61813fc32d 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -420,6 +420,9 @@ Branch-2 ( Unreleased changes ) HADOOP-8720. TestLocalFileSystem should use test root subdirectory. (Vlad Rozov via eli) + HADOOP-8721. ZKFC should not retry 45 times when attempting a graceful + fence during a failover. (Vinayakumar B via atm) + BREAKDOWN OF HDFS-3042 SUBTASKS HADOOP-8220. ZKFailoverController doesn't handle failure to become active diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java index 879155b40c..fbcc3ed7fe 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java @@ -154,6 +154,11 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic { "ha.failover-controller.graceful-fence.rpc-timeout.ms"; public static final int HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT = 5000; + /* FC connection retries for graceful fencing */ + public static final String HA_FC_GRACEFUL_FENCE_CONNECTION_RETRIES = + "ha.failover-controller.graceful-fence.connection.retries"; + public static final int HA_FC_GRACEFUL_FENCE_CONNECTION_RETRIES_DEFAULT = 1; + /* Timeout that the CLI (manual) FC waits for monitorHealth, getServiceState */ public static final String HA_FC_CLI_CHECK_TIMEOUT_KEY = "ha.failover-controller.cli-check.rpc-timeout.ms"; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java index 
b1d2c7e181..d952e29381 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java @@ -49,16 +49,34 @@ public class FailoverController { private final int rpcTimeoutToNewActive; private final Configuration conf; + /* + * Need a copy of conf for graceful fence to set + * configurable retries for IPC client. + * Refer to HDFS-3561 + */ + private final Configuration gracefulFenceConf; private final RequestSource requestSource; public FailoverController(Configuration conf, RequestSource source) { this.conf = conf; + this.gracefulFenceConf = new Configuration(conf); this.requestSource = source; this.gracefulFenceTimeout = getGracefulFenceTimeout(conf); this.rpcTimeoutToNewActive = getRpcTimeoutToNewActive(conf); + + //Configure fewer retries for graceful fence + int gracefulFenceConnectRetries = conf.getInt( + CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_CONNECTION_RETRIES, + CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_CONNECTION_RETRIES_DEFAULT); + gracefulFenceConf.setInt( + CommonConfigurationKeys.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, + gracefulFenceConnectRetries); + gracefulFenceConf.setInt( + CommonConfigurationKeys.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + gracefulFenceConnectRetries); } static int getGracefulFenceTimeout(Configuration conf) { @@ -150,7 +168,7 @@ private StateChangeRequestInfo createReqInfo() { boolean tryGracefulFence(HAServiceTarget svc) { HAServiceProtocol proxy = null; try { - proxy = svc.getProxy(conf, gracefulFenceTimeout); + proxy = svc.getProxy(gracefulFenceConf, gracefulFenceTimeout); proxy.transitionToStandby(createReqInfo()); return true; } catch (ServiceFailedException sfe) {