YARN-2019. Retrospect on decision of making RM crashed if any exception throw in ZKRMStateStore. Contributed by Jian He.

This commit is contained in:
Junping Du 2015-07-22 17:52:35 -07:00
parent e91ccfad07
commit ee98d6354b
4 changed files with 37 additions and 2 deletions

View File

@ -144,6 +144,9 @@ Release 2.8.0 - UNRELEASED
YARN-2003. Support for Application priority : Changes in RM and Capacity YARN-2003. Support for Application priority : Changes in RM and Capacity
Scheduler. (Sunil G via wangda) Scheduler. (Sunil G via wangda)
YARN-2019. Retrospect on decision of making RM crashed if any exception throw
in ZKRMStateStore. (Jian He via junping_du)
IMPROVEMENTS IMPROVEMENTS
YARN-644. Basic null check is not performed on passed in arguments before YARN-644. Basic null check is not performed on passed in arguments before

View File

@ -401,6 +401,11 @@ private static void addDeprecatedKeys() {
public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled"; public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled";
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
public static final boolean DEFAULT_YARN_FAIL_FAST = true;
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
@Private @Private
public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX
+ "work-preserving-recovery.enabled"; + "work-preserving-recovery.enabled";
@ -2018,6 +2023,12 @@ public static boolean useHttps(Configuration conf) {
YARN_HTTP_POLICY_DEFAULT)); YARN_HTTP_POLICY_DEFAULT));
} }
/**
 * Resolves whether the ResourceManager should fail fast on errors.
 *
 * The RM-specific key {@code YarnConfiguration.RM_FAIL_FAST} is looked up
 * first; the value read for {@code YarnConfiguration.YARN_FAIL_FAST} (which
 * in turn uses {@code YarnConfiguration.DEFAULT_YARN_FAIL_FAST} as its own
 * default) is passed as the default for that lookup.
 *
 * @param conf the configuration to read the fail-fast settings from
 * @return the resolved fail-fast flag for the RM
 */
public static boolean shouldRMFailFast(Configuration conf) {
  // Read the YARN-wide setting first; it serves as the default below.
  boolean yarnWideFailFast = conf.getBoolean(
      YarnConfiguration.YARN_FAIL_FAST,
      YarnConfiguration.DEFAULT_YARN_FAIL_FAST);
  return conf.getBoolean(YarnConfiguration.RM_FAIL_FAST, yarnWideFailFast);
}
@Private @Private
public static String getClusterId(Configuration conf) { public static String getClusterId(Configuration conf) {
String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID); String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID);

View File

@ -323,6 +323,22 @@
<value>false</value> <value>false</value>
</property> </property>
<property>
<description>Should RM fail fast if it encounters any errors. By default, it
points to ${yarn.fail-fast}. Errors include:
1) exceptions when state-store write/read operations fail.
</description>
<name>yarn.resourcemanager.fail-fast</name>
<value>${yarn.fail-fast}</value>
</property>
<property>
<description>Should YARN fail fast if it encounters any errors.
</description>
<name>yarn.fail-fast</name>
<value>true</value>
</property>
<property> <property>
<description>Enable RM work preserving recovery. This configuration is private <description>Enable RM work preserving recovery. This configuration is private
to YARN for experimenting the feature. to YARN for experimenting the feature.

View File

@ -44,6 +44,7 @@
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.event.EventHandler;
@ -855,6 +856,7 @@ protected void handleStoreEvent(RMStateStoreEvent event) {
* @param failureCause the exception due to which the operation failed * @param failureCause the exception due to which the operation failed
*/ */
protected void notifyStoreOperationFailed(Exception failureCause) { protected void notifyStoreOperationFailed(Exception failureCause) {
LOG.error("State store operation failed ", failureCause);
if (failureCause instanceof StoreFencedException) { if (failureCause instanceof StoreFencedException) {
updateFencedState(); updateFencedState();
Thread standByTransitionThread = Thread standByTransitionThread =
@ -862,8 +864,11 @@ protected void notifyStoreOperationFailed(Exception failureCause) {
standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.setName("StandByTransitionThread Handler");
standByTransitionThread.start(); standByTransitionThread.start();
} else { } else {
rmDispatcher.getEventHandler().handle( if (YarnConfiguration.shouldRMFailFast(getConfig())) {
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause)); rmDispatcher.getEventHandler().handle(
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
failureCause));
}
} }
} }