Repository: incubator-slider Updated Branches: refs/heads/develop 922439e58 -> 39e04e36e
SLIDER-743. Include node failure history when choosing placement hints Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/39e04e36 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/39e04e36 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/39e04e36 Branch: refs/heads/develop Commit: 39e04e36ed72e8a40434e82db0d7f0f210f26f1d Parents: 922439e Author: Sumit Mohanty <smoha...@hortonworks.com> Authored: Fri Jan 30 16:54:26 2015 -0800 Committer: Sumit Mohanty <smoha...@hortonworks.com> Committed: Fri Jan 30 16:54:26 2015 -0800 ---------------------------------------------------------------------- .../org/apache/slider/api/ResourceKeys.java | 13 ++++++++- .../apache/slider/providers/ProviderRole.java | 11 ++++++-- .../slideram/SliderAMClientProvider.java | 3 +- .../server/appmaster/SliderAppMaster.java | 1 - .../slider/server/appmaster/state/AppState.java | 26 ++++++++++++++++-- .../appmaster/state/OutstandingRequest.java | 20 ++++++++++++-- .../server/appmaster/state/RoleHistory.java | 4 +-- .../server/appmaster/state/RoleStatus.java | 9 ++++++ .../TestMockAppStateDynamicRoles.groovy | 9 ++++++ .../TestRoleHistoryRequestTracking.groovy | 29 +++++++++++++++++++- 10 files changed, 110 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java index 52633f4..50ca82f 100644 --- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java +++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java @@ -100,7 +100,11 @@ public interface ResourceKeys 
{ */ String COMPONENT_PLACEMENT_POLICY = "yarn.component.placement.policy"; - + /** + * Maximum number of node failures that can be tolerated by a component on a specific node + */ + String NODE_FAILURE_THRESHOLD = + "yarn.node.failure.threshold"; /** * maximum number of failed containers (in a single role) @@ -130,6 +134,13 @@ public interface ResourceKeys { int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5; /** + * Default node failure threshold for a component instance: {@value} + * Should be lower than the default component failure threshold to allow + * the component to start elsewhere + */ + int DEFAULT_NODE_FAILURE_THRESHOLD = 3; + + /** * Log aggregation include, exclude patterns */ String YARN_LOG_INCLUDE_PATTERNS = "yarn.log.include.patterns"; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java index 5b85f7b..17124d2 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java +++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java @@ -18,6 +18,8 @@ package org.apache.slider.providers; +import org.apache.slider.api.ResourceKeys; + /** * Provider role and key for use in app requests. 
* @@ -28,15 +30,17 @@ public final class ProviderRole { public final String name; public final int id; public final int placementPolicy; + public final int nodeFailureThreshold; public ProviderRole(String name, int id) { - this(name, id, PlacementPolicy.DEFAULT); + this(name, id, PlacementPolicy.DEFAULT, ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD); } - public ProviderRole(String name, int id, int policy) { + public ProviderRole(String name, int id, int policy, int nodeFailureThreshold) { this.name = name; this.id = id; this.placementPolicy = policy; + this.nodeFailureThreshold = nodeFailureThreshold; } @Override @@ -59,10 +63,11 @@ public final class ProviderRole { @Override public String toString() { - return "ProviderRole{" + + return "ProviderRole {" + "name='" + name + '\'' + ", id=" + id + ", policy=" + placementPolicy + + ", nodeFailureThreshold=" + nodeFailureThreshold + '}'; } } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java index b790713..1666c84 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java +++ b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java @@ -88,7 +88,8 @@ public class SliderAMClientProvider extends AbstractClientProvider public static final ProviderRole APPMASTER = new ProviderRole(COMPONENT_AM, KEY_AM, - PlacementPolicy.EXCLUDE_FROM_FLEXING); + PlacementPolicy.EXCLUDE_FROM_FLEXING, + ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD); /** * Initialize role list 
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index 2629a4d..b49366e 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -945,7 +945,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService * Creates and starts the web application, and adds a * <code>WebAppService</code> service under the AM, to ensure * a managed web application shutdown. - * @param serviceConf AM configuration * @param port port to deploy the web application on * @param webAppApi web app API instance */ http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java index 95a7ca5..61b0cd6 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java @@ -276,6 +276,7 @@ public class AppState { private long startTimeThreshold; private int failureThreshold = 10; + private int nodeFailureThreshold = 3; private String logServerURL = ""; @@ -559,6 +560,9 @@ public class AppState { failureThreshold = globalResOpts.getOptionInt( ResourceKeys.CONTAINER_FAILURE_THRESHOLD, ResourceKeys.DEFAULT_CONTAINER_FAILURE_THRESHOLD); + nodeFailureThreshold = 
globalResOpts.getOptionInt( + ResourceKeys.NODE_FAILURE_THRESHOLD, + ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD); initClusterStatus(); @@ -625,7 +629,8 @@ public class AppState { int placement = SliderUtils.parseAndValidate("value of " + name + " " + ResourceKeys.COMPONENT_PLACEMENT_POLICY, placementOpt, 0, 0, -1); - ProviderRole newRole = new ProviderRole(name, priority, placement); + ProviderRole newRole = new ProviderRole(name, priority, placement, + getNodeFailureThresholdForRole(name)); log.info("New {} ", newRole); return newRole; } @@ -1376,6 +1381,7 @@ public class AppState { if (started > 0) { long duration = time - started; shortlived = duration < (startTimeThreshold * 1000); + log.info("Duration {} and startTimeThreshold {}", duration, startTimeThreshold); } else { // never even saw a start event shortlived = true; @@ -1704,7 +1710,21 @@ public class AppState { ResourceKeys.CONTAINER_FAILURE_THRESHOLD, failureThreshold); } - + + /** + * Get the node failure threshold for a specific role, falling back to + * the global one if not set for the role + * @param roleName role name + * @return the threshold for failures + */ + private int getNodeFailureThresholdForRole(String roleName) { + ConfTreeOperations resources = + instanceDefinition.getResourceOperations(); + return resources.getComponentOptInt(roleName, + ResourceKeys.NODE_FAILURE_THRESHOLD, + nodeFailureThreshold); + } + /** * Reset the failure counts of all roles */ @@ -1712,7 +1732,7 @@ public class AppState { for (RoleStatus roleStatus : getRoleStatusMap().values()) { int failed = roleStatus.resetFailed(); log.info("Resetting failure count of {}; was {}", - roleStatus.getName(), + roleStatus.getName(), failed); } roleHistory.resetFailedRecently(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java ---------------------------------------------------------------------- diff --git 
a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java index d6022e0..6acac89 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java @@ -100,12 +100,26 @@ public final class OutstandingRequest { * @param labelExpression label to satisfy * @return the request to raise */ - public AMRMClient.ContainerRequest buildContainerRequest(Resource resource, - RoleStatus role, long time, String labelExpression) { + public AMRMClient.ContainerRequest buildContainerRequest( + Resource resource, RoleStatus role, long time, String labelExpression) { String[] hosts; boolean relaxLocality; requestedTime = time; - if (node != null) { + boolean usePlacementHistory = role.isStrictPlacement(); + if (!usePlacementHistory) { + // If strict placement does not mandate using placement then check + // that the recent failures on this node is not higher than threshold + if (node != null) { + int numFailuresOnLastHost = node.get(role.getKey()).getFailedRecently(); + usePlacementHistory = numFailuresOnLastHost <= role.getNodeFailureThreshold(); + if(!usePlacementHistory) { + log.info("Recent node failures {} is higher than threshold {}. 
Dropping host {} from preference.", + numFailuresOnLastHost, role.getNodeFailureThreshold(), node.hostname); + } + } + } + + if (node != null && usePlacementHistory) { hosts = new String[1]; hosts[0] = node.hostname; relaxLocality = !role.isStrictPlacement(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java index 605a4f8..e94457a 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java @@ -443,7 +443,7 @@ public class RoleHistory { /** * Get the nodes for an ID -may be null * @param id role ID - * @return potenially null list + * @return potentially null list */ @VisibleForTesting public List<NodeInstance> getNodesForRoleId(int id) { @@ -755,7 +755,7 @@ public class RoleHistory { boolean wasReleased, boolean shortLived) { NodeEntry nodeEntry = getOrCreateNodeEntry(container); - log.debug("Finished container for node {}, released={}, shortlived={}", + log.info("Finished container for node {}, released={}, shortlived={}", nodeEntry.rolePriority, wasReleased, shortLived); boolean available; if (shortLived) { http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java index 3edc5f1..22c5164 100644 --- 
a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java @@ -76,6 +76,15 @@ public final class RoleStatus implements Cloneable { return providerRole.placementPolicy; } + /** + * The number of failures on a specific node that can be tolerated + * before selecting a different node for placement + * @return the node failure threshold of this role + */ + public int getNodeFailureThreshold() { + return providerRole.nodeFailureThreshold; + } + public boolean getExcludeFromFlexing() { return 0 != (getPlacementPolicy() & PlacementPolicy.EXCLUDE_FROM_FLEXING); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy index 83fb273..13ecf13 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy @@ -84,6 +84,8 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest (ResourceKeys.COMPONENT_INSTANCES): "1", (ResourceKeys.COMPONENT_PLACEMENT_POLICY): Integer.toString(PlacementPolicy.STRICT), + (ResourceKeys.NODE_FAILURE_THRESHOLD): + Integer.toString(2), ] instance.resourceOperations.components[ROLE5]= opts5 @@ -148,6 +150,13 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest } @Test + public void testNodeFailureThresholdPropagation() throws Throwable { + assert (appState.lookupRoleStatus(ROLE4).nodeFailureThreshold == 3) + assert 
(appState.lookupRoleStatus(ROLE5).nodeFailureThreshold == 2) + + } + + @Test public void testLaxPlacementSecondRequestRole4() throws Throwable { log.info("Initial engine state = $engine") def role4 = appState.lookupRoleStatus(ROLE4) http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy index 8f577e5..d87222d 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy @@ -82,7 +82,34 @@ class TestRoleHistoryRequestTracking extends BaseMockAppStateTest { List<NodeInstance> a2 = roleHistory.cloneAvailableList(0) assertListEquals([age2Active0], a2) } - + + @Test + public void testRequestedNodeOffListWithFailures() throws Throwable { + NodeInstance ni = roleHistory.findNodeForNewInstance(roleStatus) + assert age3Active0 == ni + AMRMClient.ContainerRequest req = roleHistory.requestInstanceOnNode(ni, + roleStatus, + resource, + "") + assert 1 == req.nodes.size() + List<NodeInstance> a2 = roleHistory.cloneAvailableList(0) + assertListEquals([age2Active0], a2) + + age3Active0.get(0).failedRecently = 4 + req = roleHistory.requestInstanceOnNode(ni, + roleStatus, + resource, + "") + assertNull(req.nodes) + + age3Active0.get(0).failedRecently = 0 + req = roleHistory.requestInstanceOnNode(ni, + roleStatus, + resource, + "") + assert 1 == req.nodes.size() + } + @Test public void testFindAndRequestNode() throws Throwable { AMRMClient.ContainerRequest req = 
roleHistory.requestNode(roleStatus, resource)