SLIDER-310 failure threshold can be set per component, which overrides the 
global value


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/564462e8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/564462e8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/564462e8

Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry
Commit: 564462e89fcc80b6214585491ce712d65edd1bed
Parents: 2f8bda2
Author: Steve Loughran <ste...@apache.org>
Authored: Thu Aug 14 12:37:05 2014 +0100
Committer: Steve Loughran <ste...@apache.org>
Committed: Thu Aug 14 13:19:27 2014 +0100

----------------------------------------------------------------------
 .../slider/server/appmaster/state/AppState.java | 17 +++++++++--
 .../TestRegionServerFailureThreshold.groovy     | 31 ++++++++++++++++----
 2 files changed, 40 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/564462e8/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index d616678..d22c716 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -1480,7 +1480,8 @@ public class AppState {
       throw new TriggerClusterTeardownException(
         SliderExitCodes.EXIT_DEPLOYMENT_FAILED,
         ErrorStrings.E_UNSTABLE_CLUSTER +
-        " - failed with role %s failing %d times (%d in startup); threshold is %d - last failure: %s",
+        " - failed with role %s failing %d times (%d in startup);" +
+        " threshold is %d - last failure: %s",
         role.getName(),
         role.getFailed(),
         role.getStartFailed(),
@@ -1489,8 +1490,18 @@ public class AppState {
     }
   }
 
-  private int getFailureThresholdForRole(RoleStatus status) {
-    return failureThreshold;
+  /**
+   * Get the failure threshold for a specific role, falling back to
+   * the global one if the role does not define its own.
+   * @param roleStatus role whose failure threshold is being looked up
+   * @return the threshold for failures
+   */
+  private int getFailureThresholdForRole(RoleStatus roleStatus) {
+    ConfTreeOperations resources =
+        instanceDefinition.getResourceOperations();
+    return resources.getComponentOptInt(roleStatus.getName(),
+        ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
+        failureThreshold);
   }
   
   /**

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/564462e8/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy
----------------------------------------------------------------------
diff --git a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy
index b3aaa48..eb44ae0 100644
--- a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy
+++ b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy
@@ -34,6 +34,8 @@ import org.apache.slider.client.SliderClient
 import org.apache.slider.providers.hbase.minicluster.HBaseMiniClusterTestBase
 import org.junit.Test
 
+import static org.apache.slider.providers.hbase.HBaseKeys.ROLE_WORKER
+
 /**
  * test that if a container is killed too many times,
  * the AM stays down
@@ -44,10 +46,17 @@ import org.junit.Test
 class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase {
 
   @Test
-  public void testFailedRegionService() throws Throwable {
+  public void testRegionServerFailureThreshold() throws Throwable {
     failureThresholdTestRun("", true, 2, 5)
   }
 
+  /**
+   * Sets the failure threshold, then runs the given number of kill attempts
+   * @param testName
+   * @param toKill
+   * @param threshold
+   * @param killAttempts
+   */
   private void failureThresholdTestRun(
       String testName,
       boolean toKill,
@@ -57,16 +66,23 @@ class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase {
     int regionServerCount = 1
     String clustername = createMiniCluster(testName, configuration, 1, 1, 1, true, true)
     describe(
-        "Create a single region service HBase instance then " + action + " the RS");
+        "Create a single region service HBase instance" +
+        "then $action the RS $killAttempts times with a threshold of $threshold");
 
     //now launch the cluster
+    def globalThreshold = threshold - 1
     ServiceLauncher<SliderClient> launcher = createHBaseCluster(
         clustername,
         regionServerCount,
         [
+            Arguments.ARG_RES_COMP_OPT,
+            ROLE_WORKER,
+            ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
+            Integer.toString(threshold),
+
             Arguments.ARG_RESOURCE_OPT, 
             ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
-            Integer.toString(threshold)
+            Integer.toString(globalThreshold)
         ],
         true,
         true)
@@ -74,9 +90,14 @@ class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase {
     addToTeardown(client);
     def aggregateConf = client.loadPersistedClusterDescription(clustername)
     log.info aggregateConf.toString()
-    def failureOptValue = aggregateConf.resourceOperations.globalOptions.getMandatoryOptionInt(
+
+    def resourceOperations = aggregateConf.resourceOperations
+    def failureOptValue = resourceOperations.globalOptions.getMandatoryOptionInt(
         ResourceKeys.CONTAINER_FAILURE_THRESHOLD)
-    assert threshold == failureOptValue
+    assert globalThreshold == failureOptValue
+    def workerThreshold = resourceOperations.getComponentOptInt(ROLE_WORKER,
+        ResourceKeys.CONTAINER_FAILURE_THRESHOLD, 0)
+    assert threshold == workerThreshold
     ClusterDescription status = client.getClusterDescription(clustername)
 
     ClusterStatus clustat = basicHBaseClusterStartupSequence(client)

Reply via email to