SLIDER-310 failure threshold can be set per component, which overrides the global value
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/564462e8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/564462e8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/564462e8 Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry Commit: 564462e89fcc80b6214585491ce712d65edd1bed Parents: 2f8bda2 Author: Steve Loughran <ste...@apache.org> Authored: Thu Aug 14 12:37:05 2014 +0100 Committer: Steve Loughran <ste...@apache.org> Committed: Thu Aug 14 13:19:27 2014 +0100 ---------------------------------------------------------------------- .../slider/server/appmaster/state/AppState.java | 17 +++++++++-- .../TestRegionServerFailureThreshold.groovy | 31 ++++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/564462e8/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java index d616678..d22c716 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java @@ -1480,7 +1480,8 @@ public class AppState { throw new TriggerClusterTeardownException( SliderExitCodes.EXIT_DEPLOYMENT_FAILED, ErrorStrings.E_UNSTABLE_CLUSTER + - " - failed with role %s failing %d times (%d in startup); threshold is %d - last failure: %s", + " - failed with role %s failing %d times (%d in startup);" + + " threshold is %d - last failure: %s", role.getName(), role.getFailed(), role.getStartFailed(), @@ -1489,8 +1490,18 @@ public class 
AppState { } } - private int getFailureThresholdForRole(RoleStatus status) { - return failureThreshold; + /** + * Get the failure threshold for a specific role, falling back to + * the global one if no role-specific value is set + * @param roleStatus role whose name is used for the lookup + * @return the threshold for failures + */ + private int getFailureThresholdForRole(RoleStatus roleStatus) { + ConfTreeOperations resources = + instanceDefinition.getResourceOperations(); + return resources.getComponentOptInt(roleStatus.getName(), + ResourceKeys.CONTAINER_FAILURE_SHORTLIFE, + failureThreshold); } /** http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/564462e8/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy ---------------------------------------------------------------------- diff --git a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy index b3aaa48..eb44ae0 100644 --- a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy +++ b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy @@ -34,6 +34,8 @@ import org.apache.slider.client.SliderClient import org.apache.slider.providers.hbase.minicluster.HBaseMiniClusterTestBase import org.junit.Test +import static org.apache.slider.providers.hbase.HBaseKeys.ROLE_WORKER + /** * test that if a container is killed too many times, * the AM stays down @@ -44,10 +46,17 @@ import org.junit.Test class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase { @Test - public void testFailedRegionService() 
throws Throwable { + public void testRegionServerFailureThreshold() throws Throwable { failureThresholdTestRun("", true, 2, 5) } + /** + * Sets the failure threshold then runs the given number of kill attempts + * @param testName name passed to the mini cluster creation + * @param toKill + * @param threshold failure threshold set on the worker role; the global threshold is set one lower + * @param killAttempts number of times the action is applied to the RS + */ private void failureThresholdTestRun( String testName, boolean toKill, @@ -57,16 +66,23 @@ class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase { int regionServerCount = 1 String clustername = createMiniCluster(testName, configuration, 1, 1, 1, true, true) describe( - "Create a single region service HBase instance then " + action + " the RS"); + "Create a single region service HBase instance" + + "then $action the RS $killAttempts times with a threshold of $threshold"); //now launch the cluster + def globalThreshold = threshold - 1 ServiceLauncher<SliderClient> launcher = createHBaseCluster( clustername, regionServerCount, [ + Arguments.ARG_RES_COMP_OPT, + ROLE_WORKER, + ResourceKeys.CONTAINER_FAILURE_THRESHOLD, + Integer.toString(threshold), + Arguments.ARG_RESOURCE_OPT, ResourceKeys.CONTAINER_FAILURE_THRESHOLD, - Integer.toString(threshold) + Integer.toString(globalThreshold) ], true, true) @@ -74,9 +90,14 @@ class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase { addToTeardown(client); def aggregateConf = client.loadPersistedClusterDescription(clustername) log.info aggregateConf.toString() - def failureOptValue = aggregateConf.resourceOperations.globalOptions.getMandatoryOptionInt( + + def resourceOperations = aggregateConf.resourceOperations + def failureOptValue = resourceOperations.globalOptions.getMandatoryOptionInt( ResourceKeys.CONTAINER_FAILURE_THRESHOLD) - assert threshold == failureOptValue + assert globalThreshold == failureOptValue + def workerThreshold = resourceOperations.getComponentOptInt(ROLE_WORKER, + ResourceKeys.CONTAINER_FAILURE_THRESHOLD, 0) + assert threshold == workerThreshold ClusterDescription status = 
client.getClusterDescription(clustername) ClusterStatus clustat = basicHBaseClusterStartupSequence(client)