Repository: incubator-slider Updated Branches: refs/heads/develop 0f436c865 -> cc7a644ea
SLIDER-1250 Tests for Health Threshold Monitoring Feature (SLIDER-1246) Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/cc7a644e Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/cc7a644e Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/cc7a644e Branch: refs/heads/develop Commit: cc7a644ea967e86be267c7cf7519d1d7402c60b2 Parents: 0f436c8 Author: Gour Saha <gourks...@apache.org> Authored: Wed Oct 4 00:52:39 2017 -0700 Committer: Gour Saha <gourks...@apache.org> Committed: Wed Oct 4 00:53:34 2017 -0700 ---------------------------------------------------------------------- .../resources_health_monitor_60.json | 22 + .../resources_health_monitor_80.json | 22 + ...urces_health_monitor_lots_of_containers.json | 18 + .../resources_health_monitor_uniq_names_60.json | 23 ++ .../resources_health_monitor_uniq_names_80.json | 23 ++ .../funtest/framework/CommandTestBase.groovy | 1 + .../apache/slider/funtest/ResourcePaths.groovy | 10 + .../lifecycle/AppsHealthMonitorIT.groovy | 408 +++++++++++++++++++ 8 files changed, 527 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json ---------------------------------------------------------------------- diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json new file mode 100644 index 0000000..53c1feb --- /dev/null +++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json @@ -0,0 +1,22 @@ +{ + "schema": "http://example.org/specification/v2.0.0", + "metadata": { + }, + "global": { + }, + "components": { + "COMMAND_LOGGER": { + "yarn.memory": "128", + "yarn.role.priority": 
"1", + "yarn.component.instances": "3", + "yarn.container.health.threshold.percent" : "60", + "yarn.container.health.threshold.window.secs" : "5", + "yarn.container.health.threshold.init.delay.secs" : "1", + "yarn.container.health.threshold.poll.frequency.secs" : "2", + "yarn.node.failure.threshold" : "1000" + }, + "slider-appmaster": { + "yarn.memory": "512" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json ---------------------------------------------------------------------- diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json new file mode 100644 index 0000000..b65bd23 --- /dev/null +++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json @@ -0,0 +1,22 @@ +{ + "schema": "http://example.org/specification/v2.0.0", + "metadata": { + }, + "global": { + }, + "components": { + "COMMAND_LOGGER": { + "yarn.memory": "128", + "yarn.role.priority": "1", + "yarn.component.instances": "3", + "yarn.container.health.threshold.percent" : "80", + "yarn.container.health.threshold.window.secs" : "5", + "yarn.container.health.threshold.init.delay.secs" : "1", + "yarn.container.health.threshold.poll.frequency.secs" : "2", + "yarn.node.failure.threshold" : "1000" + }, + "slider-appmaster": { + "yarn.memory": "512" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json ---------------------------------------------------------------------- diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json new file mode 100644 index 
0000000..0e22b25 --- /dev/null +++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json @@ -0,0 +1,18 @@ +{ + "schema": "http://example.org/specification/v2.0.0", + "metadata": { + }, + "global": { + }, + "components": { + "COMMAND_LOGGER": { + "yarn.memory": "128", + "yarn.role.priority": "1", + "yarn.component.instances": "10000", + "yarn.node.failure.threshold" : "1000" + }, + "slider-appmaster": { + "yarn.memory": "512" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json ---------------------------------------------------------------------- diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json new file mode 100644 index 0000000..c42fbec --- /dev/null +++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json @@ -0,0 +1,23 @@ +{ + "schema": "http://example.org/specification/v2.0.0", + "metadata": { + }, + "global": { + }, + "components": { + "COMMAND_LOGGER": { + "yarn.memory": "128", + "yarn.role.priority": "1", + "component.unique.names" : "true", + "yarn.component.instances": "3", + "yarn.container.health.threshold.percent" : "60", + "yarn.container.health.threshold.window.secs" : "5", + "yarn.container.health.threshold.init.delay.secs" : "1", + "yarn.container.health.threshold.poll.frequency.secs" : "2", + "yarn.node.failure.threshold" : "1000" + }, + "slider-appmaster": { + "yarn.memory": "512" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json ---------------------------------------------------------------------- diff --git 
a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json new file mode 100644 index 0000000..f7b37b1 --- /dev/null +++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json @@ -0,0 +1,23 @@ +{ + "schema": "http://example.org/specification/v2.0.0", + "metadata": { + }, + "global": { + }, + "components": { + "COMMAND_LOGGER": { + "yarn.memory": "128", + "yarn.role.priority": "1", + "component.unique.names" : "true", + "yarn.component.instances": "3", + "yarn.container.health.threshold.percent" : "80", + "yarn.container.health.threshold.window.secs" : "5", + "yarn.container.health.threshold.init.delay.secs" : "1", + "yarn.container.health.threshold.poll.frequency.secs" : "2", + "yarn.node.failure.threshold" : "1000" + }, + "slider-appmaster": { + "yarn.memory": "512" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy ---------------------------------------------------------------------- diff --git a/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy b/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy index 0eb7541..81dba29 100644 --- a/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy +++ b/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy @@ -375,6 +375,7 @@ abstract class CommandTestBase extends SliderTestUtils { [ ACTION_KILL_CONTAINER, name, + ARG_ID, containerID ]) } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy ---------------------------------------------------------------------- diff --git 
a/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy b/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy index 37503d9..e128cd6 100644 --- a/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy +++ b/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy @@ -27,6 +27,16 @@ interface ResourcePaths { String COMMAND_LOG_RESOURCES = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources.json" String COMMAND_LOG_RESOURCES_QUEUE_LABELS = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_queue_labels.json" String COMMAND_LOG_RESOURCES_NO_ROLE = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_no_role.json" + String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_60 = + "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_60.json" + String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_80 = + "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_80.json" + String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_60 = + "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_uniq_names_60.json" + String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_80 = + "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_uniq_names_80.json" + String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_LOTS_OF_CONTAINERS = + "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_lots_of_containers.json" String COMMAND_LOG_APPCONFIG_NO_HB = "$SLIDER_CORE_APP_PACKAGES/test_command_log/appConfig_no_hb.json" String COMMAND_LOG_APPCONFIG_FAST_NO_REG = "$SLIDER_CORE_APP_PACKAGES/test_command_log/appConfig_fast_no_reg.json" http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy ---------------------------------------------------------------------- diff --git 
a/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy new file mode 100644 index 0000000..1c072b2 --- /dev/null +++ b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.slider.funtest.lifecycle + +import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j + +import org.apache.curator.utils.EnsurePath +import org.apache.hadoop.yarn.api.records.YarnApplicationState +import org.apache.slider.api.ClusterDescription +import org.apache.slider.api.StatusKeys +import org.apache.slider.common.SliderExitCodes +import org.apache.slider.common.params.Arguments +import org.apache.slider.common.params.SliderActions +import org.apache.slider.funtest.ResourcePaths +import org.apache.slider.funtest.framework.AgentCommandTestBase +import org.apache.slider.funtest.framework.FuntestProperties +import org.apache.slider.funtest.framework.SliderShell +import org.junit.After +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.Parameterized +import org.junit.runners.Parameterized.Parameter +import org.junit.runners.Parameterized.Parameters + +import java.util.Arrays +import java.util.Collection + +/** + * These are the steps required for the Health Monitor tests - + * - Install an app package + * - Create an app A with 3 containers, 60% health threshold, 5 sec health + * window, 2 secs poll frequency, and 1 sec init delay. Node failure + * threshold is kept high at 1000 to prevent it from interfering with these tests. + * - Create another app B with lots of containers (10K say), which will + * potentially eat up all the remaining resource in the default queue. Note, + * the idea is, that YARN will not be able to fulfil all the 10K container + * requests and hence a bunch of requests will be in Outstanding state. + * - Then test the following scenarios: + * > Kill one container of the app A. YARN will immediately allocate a + * container to app B since it had Outstanding container requests ahead of + * app A. So YARN will not be able to satisfy the one container request for + * app A. 
Health of app A will come down to 66.67%, but it should continue + * to run beyond the health window (5 secs) expiry, since it is above the + * threshold of 60%. + * + * - Create an app A with 3 containers, 80% health threshold, 5 sec health + * window, 2 secs poll frequency, and 1 sec init delay. Node failure + * threshold is kept high at 1000 to prevent it from interfering with these tests. + * - Create app B with same specs as the previous test + * - Then test the following scenarios: + * > Kill one container of the app A. YARN will immediately allocate a + * container to app B since it had Outstanding container requests ahead of + * app A. So YARN will not be able to satisfy the one container request for + * app A. Health of app A will come down to 66.67%, so after the health + * window (5 secs) expiry it should be killed, since it is below threshold + * of 80%. + * + * - Create an app A with 3 containers, 80% health threshold, 5 sec health + * window, 2 secs poll frequency, and 1 sec init delay. Node failure + * threshold is kept high at 1000 to prevent it from interfering with these tests. + * - Create app B with same specs as the previous test + * - Then test the following scenarios: + * > Kill one container of the app A. YARN will immediately allocate a + * container to app B since it had Outstanding container requests ahead of + * app A. So YARN will not be able to satisfy the one container request for + * app A. Health of app A will come down to 66.67%, so if the health + * window (5 secs) expires it will be killed as proven by the previous test. + * However in this test before the window expires we do a flex down of the + * role which brings the total containers required to 2 and hence current + * health goes back to 100%. As a result app A is not killed and continues + * to run even beyond the health window expiry. + * + * - Repeat all the above 3 test scenarios but this time with app A having + * unique component names enabled. 
So a total of 6 unique tests are run in
+ *   this suite.
+ *
+ * - Note: This is a lengthy test suite. Each test takes approx 2-3 mins, so all
+ *         6 tests in this suite take approx 12-15 mins to run. The health
+ *         monitor needs multiple success and failure simulations, with an
+ *         appropriate window for each step to expire before the expected
+ *         results are validated.
+ */
+@RunWith(Parameterized.class)
+@CompileStatic
+@Slf4j
+public class AppsHealthMonitorIT extends AgentCommandTestBase
+    implements FuntestProperties, Arguments, SliderExitCodes, SliderActions {
+  private static final String COMMAND_LOGGER = "COMMAND_LOGGER"
+  private static final String APPLICATION_NAME_60 = "app-health-monitor-60"
+  private static final String APPLICATION_NAME_80 = "app-health-monitor-80"
+  private static final String APPLICATION_NAME_LOTS_OF_CONTAINERS =
+      "app-health-monitor-lots-of-containers"
+  /** Suffix added to app A's name when unique component names are enabled. */
+  private static final String UNIQUE_COMP_SUFFIX = "-uniq-comp"
+  private static final String APP_RESOURCE_60 =
+      ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_60
+  private static final String APP_RESOURCE_80 =
+      ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_80
+  private static final String APP_RESOURCE_UNIQUE_NAMES_60 =
+      ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_60
+  private static final String APP_RESOURCE_UNIQUE_NAMES_80 =
+      ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_80
+  private static final String APP_RESOURCE_LOTS_OF_CONTAINERS =
+      ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_LOTS_OF_CONTAINERS
+
+  /** True when app A is created with unique component names enabled. */
+  @Parameter
+  public Boolean isUniqueComp
+  /** Resources file giving app A a 60% health threshold. */
+  @Parameter(1)
+  public String appResourceFor60
+  /** Resources file giving app A an 80% health threshold. */
+  @Parameter(2)
+  public String appResourceFor80
+
+  /**
+   * Supplies the two parameter sets of the suite: plain component names and
+   * unique component names, each with its 60% and 80% resources files.
+   *
+   * @return one Object[] of {isUniqueComp, resourcesFor60, resourcesFor80}
+   *         per run
+   */
+  @Parameters
+  public static Collection<Object[]> data() {
+    Object[] testRun1 = [Boolean.FALSE, APP_RESOURCE_60, APP_RESOURCE_80]
+    Object[] testRun2 = [Boolean.TRUE, APP_RESOURCE_UNIQUE_NAMES_60,
+                         APP_RESOURCE_UNIQUE_NAMES_80]
+    Object[][] data = [testRun1, testRun2]
+    return Arrays.asList(data)
+  }
+
+  /**
+   * Cleans up every application a test of the current parameter set may have
+   * created: both flavours of app A and the resource-hogging app B.
+   */
+  @After
+  public void destroyCluster() {
+    cleanup(appNameFor(APPLICATION_NAME_60))
+    cleanup(appNameFor(APPLICATION_NAME_80))
+    cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+  }
+
+  /**
+   * Kills one container of a 60%-threshold app while app B holds the rest of
+   * the queue. Health drops to about 66.67%, which is still above the
+   * threshold, so the app must keep running with only 2 containers.
+   */
+  @Test
+  public void testHealthMonitorAppRunning() throws Throwable {
+    describe("Running testHealthMonitorAppRunning for apps with resources " +
+        appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+        isUniqueComp)
+    assumeAgentTestsEnabled()
+    String appName = appNameFor(APPLICATION_NAME_60)
+
+    File launchReportFile = launchAppUnderTest(appName, appResourceFor60)
+    def appId = ensureYarnApplicationIsUp(launchReportFile)
+    assertAllInstancesLive(appName)
+    waitForHealthMonitorToStart()
+    launchResourceHog()
+
+    // kill one container which will bring health down to about 66.67% but app
+    // should continue to run, since threshold is 60%
+    killOneContainer(appName)
+
+    describe("Wait for 10 secs to ensure no container was allocated even after " +
+        "expiry of health window, but then the app should continue to run")
+    sleep(1000 * 10)
+    ensureYarnApplicationIsUp(appId)
+    // Also assert that only 2 containers are running
+    if (isUniqueComp) {
+      assertContainersLive(appName, COMMAND_LOGGER + "1", 1)
+      assertContainersLive(appName, COMMAND_LOGGER + "2", 1)
+      assertContainersLive(appName, COMMAND_LOGGER + "3", 0)
+    } else {
+      assertContainersLive(appName, COMMAND_LOGGER, 2)
+    }
+  }
+
+  /**
+   * Kills one container of an 80%-threshold app while app B holds the rest of
+   * the queue. Health drops to about 66.67%, below the threshold, so the app
+   * must have been shut down once the health window has expired.
+   */
+  @Test
+  public void testHealthMonitorAppStopped() throws Throwable {
+    describe("Running testHealthMonitorAppStopped for apps with resources " +
+        appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+        isUniqueComp)
+    assumeAgentTestsEnabled()
+    String appName = appNameFor(APPLICATION_NAME_80)
+
+    File launchReportFile = launchAppUnderTest(appName, appResourceFor80)
+    // only the "is up" check matters here; the application id is not needed
+    ensureYarnApplicationIsUp(launchReportFile)
+    assertAllInstancesLive(appName)
+    waitForHealthMonitorToStart()
+    launchResourceHog()
+
+    // kill one container which will bring health down to about 66.67% and app
+    // should be shutdown after health window expires, since threshold is 80%
+    killOneContainer(appName)
+
+    describe("Wait 10 secs to give sufficient time for the app to be stopped")
+    sleep(1000 * 10)
+    if (isApplicationUp(appName)) {
+      fail("Application should have been shutdown, but is still running")
+    }
+  }
+
+  /**
+   * Same setup as {@link #testHealthMonitorAppStopped()}, but flexes the
+   * component down by one right after the kill. That brings the desired
+   * container count to 2, so the app must survive the health window.
+   */
+  @Test
+  public void testHealthMonitorAppSavedByFlex() throws Throwable {
+    describe("Running testHealthMonitorAppSavedByFlex for apps with resources " +
+        appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+        isUniqueComp)
+    assumeAgentTestsEnabled()
+    String appName = appNameFor(APPLICATION_NAME_80)
+
+    File launchReportFile = launchAppUnderTest(appName, appResourceFor80)
+    def appId = ensureYarnApplicationIsUp(launchReportFile)
+    assertAllInstancesLive(appName)
+    waitForHealthMonitorToStart()
+    launchResourceHog()
+
+    // kill one container which will bring health down to about 66.67% and app
+    // could be shutdown if health window expires, since threshold is 80%
+    killOneContainer(appName)
+
+    // Before the health window expires, let's do a flex down to bring the
+    // health above threshold and prevent the app from being killed. Let's not
+    // do any additional checks after the kill container.
+    describe("Flexing down by 1 container")
+    slider(EXIT_SUCCESS,
+        [
+            ACTION_FLEX,
+            appName,
+            ARG_COMPONENT,
+            COMMAND_LOGGER,
+            "-1"
+        ])
+
+    describe("Wait for 10 secs to give sufficient time for the health window " +
+        "to expire, and the app should continue to run")
+    sleep(1000 * 10)
+    ensureYarnApplicationIsUp(appId)
+    // Now assert that only 2 containers are running
+    if (isUniqueComp) {
+      // note, after flex down the role 3 does not even exist
+      assertContainersLive(appName, COMMAND_LOGGER + "1", 1)
+      assertContainersLive(appName, COMMAND_LOGGER + "2", 1)
+    } else {
+      assertContainersLive(appName, COMMAND_LOGGER, 2)
+    }
+  }
+
+  /**
+   * Returns the name app A runs under for the current parameter set.
+   *
+   * @param baseName one of the APPLICATION_NAME_* constants
+   * @return baseName, suffixed with "-uniq-comp" when isUniqueComp is set
+   */
+  private String appNameFor(String baseName) {
+    return isUniqueComp ? baseName + UNIQUE_COMP_SUFFIX : baseName
+  }
+
+  /**
+   * Cleans up leftovers of app A and app B, then submits app A from the
+   * standard template with the given resources file.
+   *
+   * @param appName name to create app A under
+   * @param appResource path of the resources json to use
+   * @return the launch report file handed to the create command
+   */
+  private File launchAppUnderTest(String appName, String appResource) {
+    cleanup(appName)
+    cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+
+    File launchReportFile = createTempJsonFile()
+    SliderShell shell = createTemplatedSliderApplication(
+        appName,
+        APP_TEMPLATE,
+        appResource,
+        [],
+        launchReportFile)
+    logShell(shell)
+    return launchReportFile
+  }
+
+  /**
+   * Asserts that all 3 COMMAND_LOGGER instances of app A are live, either as
+   * one component with 3 containers or as COMMAND_LOGGER1..3 with one each.
+   */
+  private void assertAllInstancesLive(String appName) {
+    if (isUniqueComp) {
+      // NOTE(review): only component 1's request count is awaited before all
+      // three components are asserted live - confirm this cannot race.
+      expectContainerRequestedCountReached(appName, COMMAND_LOGGER + "1", 1,
+          CONTAINER_LAUNCH_TIMEOUT)
+      assertContainersLive(appName, COMMAND_LOGGER + "1", 1)
+      assertContainersLive(appName, COMMAND_LOGGER + "2", 1)
+      assertContainersLive(appName, COMMAND_LOGGER + "3", 1)
+    } else {
+      expectContainerRequestedCountReached(appName, COMMAND_LOGGER, 3,
+          CONTAINER_LAUNCH_TIMEOUT)
+      assertContainersLive(appName, COMMAND_LOGGER, 3)
+    }
+  }
+
+  /**
+   * Waits for 2 secs to get past the 1 sec init delay and let the health
+   * monitor polling start.
+   */
+  private void waitForHealthMonitorToStart() {
+    describe("Wait for 2 secs to let the health monitor polling to start")
+    sleep(1000 * 2)
+  }
+
+  /**
+   * Brings up app B, which is meant to eat up all the remaining resources of
+   * the default queue, ensures it is up, and gives it 10 secs to grab
+   * containers. It asks for 10,000 containers of 128MB, about 1.28TB of
+   * memory. Note, if this test is executed in a queue with more than 1.28TB
+   * of memory (very slim chance), then it will very likely fail.
+   */
+  private void launchResourceHog() {
+    File launchReportFileLotsOfContainers = createTempJsonFile()
+    SliderShell shell = createTemplatedSliderApplication(
+        APPLICATION_NAME_LOTS_OF_CONTAINERS,
+        APP_TEMPLATE,
+        APP_RESOURCE_LOTS_OF_CONTAINERS,
+        [],
+        launchReportFileLotsOfContainers)
+    logShell(shell)
+
+    ensureYarnApplicationIsUp(launchReportFileLotsOfContainers)
+    // Wait for 10 secs to let the containers come up (until no more resource is
+    // left in the default queue)
+    describe("Wait 10 secs to let containers come up and eat up all the memory")
+    sleep(1000 * 10)
+  }
+
+  /**
+   * Kills the first listed container of app A's COMMAND_LOGGER component, or
+   * of COMMAND_LOGGER3 when unique component names are enabled.
+   */
+  private void killOneContainer(String appName) {
+    ClusterDescription cd = execStatus(appName)
+    String component = isUniqueComp ? COMMAND_LOGGER + "3" : COMMAND_LOGGER
+    String containerId = cd.instances.get(component).get(0)
+    describe("Killing container " + containerId)
+    killContainer(appName, containerId)
+  }
+}