SLIDER-202: Chaos Monkey adds a (slow) minicluster test.
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/53028eaa Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/53028eaa Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/53028eaa Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry Commit: 53028eaae347899cef9e695e9ee8b41594c6377b Parents: 52bd11b Author: Steve Loughran <ste...@apache.org> Authored: Wed Aug 13 17:30:49 2014 +0100 Committer: Steve Loughran <ste...@apache.org> Committed: Wed Aug 13 17:30:49 2014 +0100 ---------------------------------------------------------------------- .../org/apache/slider/api/InternalKeys.java | 49 +++++++--- .../apache/slider/common/tools/SliderUtils.java | 2 +- .../server/appmaster/SliderAppMaster.java | 39 ++++---- .../TestStandaloneAMMonkeyRestart.groovy | 94 ++++++++++++++++++++ .../TestRegionServerFailureThreshold.groovy | 5 +- 5 files changed, 154 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/53028eaa/slider-core/src/main/java/org/apache/slider/api/InternalKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/InternalKeys.java b/slider-core/src/main/java/org/apache/slider/api/InternalKeys.java index 4045c0e..ad384e2 100644 --- a/slider-core/src/main/java/org/apache/slider/api/InternalKeys.java +++ b/slider-core/src/main/java/org/apache/slider/api/InternalKeys.java @@ -84,31 +84,54 @@ public interface InternalKeys { * Flag to indicate whether or not the chaos monkey is enabled: * {@value} */ - String INTERNAL_CHAOS_MONKEY_ENABLED = "internal.chaos.monkey.enabled"; - boolean DEFAULT_INTERNAL_CHAOS_MONKEY_ENABLED = false; + String CHAOS_MONKEY_ENABLED = "internal.chaos.monkey.enabled"; + boolean DEFAULT_CHAOS_MONKEY_ENABLED = false; /** * Rate */ - String INTERNAL_CHAOS_MONKEY_RATE = "internal.chaos.monkey.rate"; - - int DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_DAYS = 0; - int DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_HOURS = 1; - int DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_MINUTES = 0; + String CHAOS_MONKEY_INTERVAL = "internal.chaos.monkey.interval"; + String CHAOS_MONKEY_INTERVAL_DAYS = CHAOS_MONKEY_INTERVAL + ".days"; + String CHAOS_MONKEY_INTERVAL_HOURS = CHAOS_MONKEY_INTERVAL + ".hours"; + String CHAOS_MONKEY_INTERVAL_MINUTES = CHAOS_MONKEY_INTERVAL + ".minutes"; + String CHAOS_MONKEY_INTERVAL_SECONDS = CHAOS_MONKEY_INTERVAL + ".seconds"; - String INTERNAL_CHAOS_MONKEY_PROBABILITY = + int DEFAULT_CHAOS_MONKEY_INTERVAL_DAYS = 0; + int DEFAULT_CHAOS_MONKEY_INTERVAL_HOURS = 0; + int DEFAULT_CHAOS_MONKEY_INTERVAL_MINUTES = 0; + + /** + * Prefix for all chaos monkey probabilities + */ + String CHAOS_MONKEY_PROBABILITY = "internal.chaos.monkey.probability"; /** * Probabilies are out of 10000 ; 100==1% */ - String INTERNAL_CHAOS_MONKEY_PROBABILITY_AM_FAILURE = INTERNAL_CHAOS_MONKEY_PROBABILITY +".amfailure"; - int DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE = 10; - String INTERNAL_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE = - INTERNAL_CHAOS_MONKEY_PROBABILITY + ".containerfailure"; - int DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE = 100; + /** + * Probability of a monkey check killing the AM: {@value} + */ + String CHAOS_MONKEY_PROBABILITY_AM_FAILURE = CHAOS_MONKEY_PROBABILITY +".amfailure"; + + /** + * Default probability of a monkey check killing the AM: {@value} + */ + int DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE = 0; + + /** + * Probability of a monkey check killing a container: {@value} + */ + + String CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE = + CHAOS_MONKEY_PROBABILITY + ".containerfailure"; + + /** + * Default probability of a monkey check killing the a container: {@value} + */ + int DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE = 0; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/53028eaa/slider-core/src/main/java/org/apache/slider/common/tools/SliderUtils.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/common/tools/SliderUtils.java b/slider-core/src/main/java/org/apache/slider/common/tools/SliderUtils.java index d7f159c..7a80f24 100644 --- a/slider-core/src/main/java/org/apache/slider/common/tools/SliderUtils.java +++ b/slider-core/src/main/java/org/apache/slider/common/tools/SliderUtils.java @@ -1476,7 +1476,7 @@ public final class SliderUtils { } is = new ByteArrayInputStream(content); } else { - log.info("Size unknown. Reading {}", zipEntry.getName()); + log.debug("Size unknown. Reading {}", zipEntry.getName()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); while (true) { int byteRead = zis.read(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/53028eaa/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index 1642cc5..7825e24 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -779,17 +779,16 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // now do the registration registerServiceInstance(clustername, appid); - sliderAMProvider.start(); + // chaos monkey + maybeStartMonkey(); + // Start the Slider AM provider + sliderAMProvider.start(); - - // launch the provider; this is expected to trigger a callback that + // launch the real provider; this is expected to trigger a callback that // starts the node review process launchProviderService(instanceDefinition, confDir); - // chaos monkey - maybeStartMonkey(); - try { //now block waiting to be told to exit the process waitForAMCompletionSignal(); @@ -1459,7 +1458,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService rmOperationHandler.execute(operations); } - /** * Get the RM operations handler for direct scheduling of work. */ @@ -1739,42 +1737,45 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } signalAMComplete(exitCode, exception.toString()); } - - public boolean maybeStartMonkey() { + + /** + * Start the chaos monkey + * @return true if it started + */ + private boolean maybeStartMonkey() { MapOperations internals = getGlobalInternalOptions(); Boolean enabled = - internals.getOptionBool(InternalKeys.INTERNAL_CHAOS_MONKEY_ENABLED, - InternalKeys.DEFAULT_INTERNAL_CHAOS_MONKEY_ENABLED); + internals.getOptionBool(InternalKeys.CHAOS_MONKEY_ENABLED, + InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED); if (!enabled) { log.info("Chaos monkey disabled"); } long monkeyInterval = internals.getTimeRange( - InternalKeys.INTERNAL_CHAOS_MONKEY_RATE, - InternalKeys.DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_DAYS, - InternalKeys.DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_HOURS, - InternalKeys.DEFAULT_INTERNAL_CHAOS_MONKEY_RATE_MINUTES, + InternalKeys.CHAOS_MONKEY_INTERVAL, + InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_DAYS, + InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_HOURS, + InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_MINUTES, 0); log.info("Adding Chaos Monkey scheduled every {} seconds ({} hours)", monkeyInterval, monkeyInterval/(60*60)); monkey = new ChaosMonkeyService(metrics, actionQueues); int amKillProbability = internals.getOptionInt( - InternalKeys.INTERNAL_CHAOS_MONKEY_PROBABILITY_AM_FAILURE, + InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_FAILURE, InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE); if (amKillProbability > 0) { - log.info("Adding AM killer with probability %f", amKillProbability/100.0); monkey.addTarget("AM killer", new ChaosKillAM(actionQueues, -1), amKillProbability ); } int containerKillProbability = internals.getOptionInt( - InternalKeys.INTERNAL_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE, + InternalKeys.CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE, InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE); if (containerKillProbability > 0) { monkey.addTarget("Container killer", new ChaosKillContainer(appState, actionQueues, rmOperationHandler), - amKillProbability + containerKillProbability ); } initAndAddService(monkey); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/53028eaa/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMMonkeyRestart.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMMonkeyRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMMonkeyRestart.groovy new file mode 100644 index 0000000..162bab0 --- /dev/null +++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMMonkeyRestart.groovy @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.slider.agent.standalone + +import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j +import org.apache.hadoop.SleepJob +import org.apache.hadoop.yarn.api.records.ApplicationReport +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus +import org.apache.hadoop.yarn.api.records.YarnApplicationState +import org.apache.hadoop.yarn.conf.YarnConfiguration +import org.apache.slider.agent.AgentMiniClusterTestBase +import org.apache.slider.api.InternalKeys +import org.apache.slider.api.ResourceKeys +import org.apache.slider.client.SliderClient +import org.apache.slider.common.SliderXmlConfKeys +import org.apache.slider.common.params.ActionAMSuicideArgs +import org.apache.slider.common.params.Arguments +import org.apache.slider.core.exceptions.ErrorStrings +import org.apache.slider.core.main.ServiceLauncher +import org.junit.Test + +/** + * kill a masterless AM and verify it shuts down. This test + * also sets the retry count to 1 to stop recreation attempts + */ +@CompileStatic +@Slf4j + +class TestStandaloneAMMonkeyRestart extends AgentMiniClusterTestBase { + + + @Test + public void testStandaloneAMMonkeyRestart() throws Throwable { + describe "Run a Standalone AM with the Chaos monkey set to kill it" + // patch the configuration for AM restart + int threshold = 2; + YarnConfiguration conf = getRestartableConfiguration(threshold) + + String clustername = createMiniCluster("", conf, 1, true) + ServiceLauncher<SliderClient> launcher = + createStandaloneAMWithArgs(clustername, + [ + Arguments.ARG_OPTION, InternalKeys.CHAOS_MONKEY_ENABLED, "true", + Arguments.ARG_OPTION, InternalKeys.CHAOS_MONKEY_INTERVAL_SECONDS, "8", + Arguments.ARG_OPTION, InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_FAILURE, "75000", + ], + true, + false) + SliderClient sliderClient = launcher.service + addToTeardown(sliderClient); + + ApplicationReport report + report = waitForClusterLive(sliderClient, 30000) + describe "Waiting for the cluster to fail" + sleep(40000) + // end of process + report = sliderClient.applicationReport + log.info(report.diagnostics) + assert report.currentApplicationAttemptId.attemptId == threshold + assert YarnApplicationState.FAILED == report.yarnApplicationState + assert FinalApplicationStatus.FAILED == report.finalApplicationStatus + } + + /** + * Get a restartable configuration + * @param restarts + * @return + */ + public YarnConfiguration getRestartableConfiguration(int restarts) { + def conf = new YarnConfiguration(configuration) + conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, restarts) + conf.setInt(SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, restarts) + conf + } + + +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/53028eaa/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy ---------------------------------------------------------------------- diff --git a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy index 1322fd3..b3aaa48 100644 --- a/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy +++ b/slider-providers/hbase/slider-hbase-provider/src/test/groovy/org/apache/slider/providers/hbase/minicluster/failures/TestRegionServerFailureThreshold.groovy @@ -149,8 +149,9 @@ class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase { } } } catch (BadClusterStateException e) { - assert e.toString().contains(ErrorStrings.E_APPLICATION_NOT_RUNNING) - assert e.exitCode == SliderExitCodes.EXIT_BAD_STATE + assertExceptionDetails(e, + SliderExitCodes.EXIT_BAD_STATE, + ErrorStrings.E_APPLICATION_NOT_RUNNING) } ApplicationReport report = client.applicationReport log.info(report.diagnostics)