YARN-2019. Retrospect on decision of making RM crashed if any exception throw in ZKRMStateStore. Contributed by Jian He.
(cherry picked from commit db57d91ac91e895bcb9a23fa50af0b2fbcb1db5a) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/d27f09c9 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/d27f09c9 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/d27f09c9 Branch: refs/heads/branch-2.6 Commit: d27f09c9369d9f2604eb31deaff50f4bd5aa98ac Parents: c09bb46 Author: Xuan <xg...@apache.org> Authored: Mon Sep 7 17:34:33 2015 -0700 Committer: Sangjin Lee <sj...@apache.org> Committed: Fri Sep 25 16:30:49 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 +++ .../apache/hadoop/yarn/conf/YarnConfiguration.java | 11 +++++++++++ .../src/main/resources/yarn-default.xml | 16 ++++++++++++++++ .../resourcemanager/recovery/RMStateStore.java | 9 +++++++-- 4 files changed, 37 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/d27f09c9/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 6d016b7..15d082b 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -6,6 +6,9 @@ Release 2.6.2 - UNRELEASED NEW FEATURES + YARN-2019. Retrospect on decision of making RM crashed if any exception throw + in ZKRMStateStore. (Jian He via junping_du) + IMPROVEMENTS YARN-4092. 
Fixed UI redirection to print useful messages when both RMs are http://git-wip-us.apache.org/repos/asf/hadoop/blob/d27f09c9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 881ecb5..471297c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -380,6 +380,11 @@ public class YarnConfiguration extends Configuration { public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled"; public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; + public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; + public static final boolean DEFAULT_YARN_FAIL_FAST = true; + + public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; + @Private public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX + "work-preserving-recovery.enabled"; @@ -1568,6 +1573,12 @@ public class YarnConfiguration extends Configuration { YARN_HTTP_POLICY_DEFAULT)); } + public static boolean shouldRMFailFast(Configuration conf) { + return conf.getBoolean(YarnConfiguration.RM_FAIL_FAST, + conf.getBoolean(YarnConfiguration.YARN_FAIL_FAST, + YarnConfiguration.DEFAULT_YARN_FAIL_FAST)); + } + @Private public static String getClusterId(Configuration conf) { String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID); http://git-wip-us.apache.org/repos/asf/hadoop/blob/d27f09c9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml 
---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 2ffa8b8..fa2e5cc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -276,6 +276,22 @@ </property> <property> + <description>Should RM fail fast if it encounters any errors. By default, it + points to ${yarn.fail-fast}. Errors include: + 1) exceptions when state-store write/read operations fail. + </description> + <name>yarn.resourcemanager.fail-fast</name> + <value>${yarn.fail-fast}</value> + </property> + + <property> + <description>Should YARN fail fast if it encounters any errors. + </description> + <name>yarn.fail-fast</name> + <value>true</value> + </property> + + <property> <description>Enable RM work preserving recovery. This configuration is private to YARN for experimenting the feature. 
</description> http://git-wip-us.apache.org/repos/asf/hadoop/blob/d27f09c9/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 983cc81..37855f7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; @@ -820,14 +821,18 @@ public abstract class RMStateStore extends AbstractService { * @param failureCause the exception due to which the operation failed */ protected void notifyStoreOperationFailed(Exception failureCause) { + LOG.error("State store operation failed ", failureCause); if (failureCause instanceof StoreFencedException) { Thread standByTransitionThread = new Thread(new StandByTransitionThread()); 
standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.start(); } else { - rmDispatcher.getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause)); + if (YarnConfiguration.shouldRMFailFast(getConfig())) { + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, + failureCause)); + } } }