Repository: ambari
Updated Branches:
  refs/heads/trunk e55523012 -> 6a8115572
AMBARI-19690: NM Memory can end up being too high on nodes with many components (jluniya)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/6a811557
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/6a811557
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/6a811557

Branch: refs/heads/trunk
Commit: 6a8115572b328785532aed27c1dc44a1bac17a01
Parents: e555230
Author: Jayush Luniya <jlun...@hortonworks.com>
Authored: Wed Jan 25 09:40:56 2017 -0800
Committer: Jayush Luniya <jlun...@hortonworks.com>
Committed: Wed Jan 25 09:40:56 2017 -0800

----------------------------------------------------------------------
 .../stacks/HDP/2.0.6/services/stack_advisor.py  |  60 +++++++++-
 .../stacks/HDP/2.5/services/stack_advisor.py    |  33 +-----
 .../src/main/resources/stacks/stack_advisor.py  |  18 +++
 .../stacks/2.0.6/common/test_stack_advisor.py   | 113 ++++++++++++++++++-
 .../stacks/2.5/common/test_stack_advisor.py     |   4 +-
 5 files changed, 191 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/6a811557/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
index 7ed1b77..55f3d30 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
@@ -1350,6 +1350,35 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
         totalMemoryRequired += self.formatXmxSizeToBytes(heapsize)
     return totalMemoryRequired
 
+  def get_yarn_nm_mem_in_mb(self, services, configurations):
+    """
+    Gets YARN NodeManager memory in MB (yarn.nodemanager.resource.memory-mb).
+    Reads from:
+    - configurations (output), if the value was changed as part of the current Stack Advisor invocation and
+      services["changed-configurations"] is empty, else
+    - services['configurations'] (input).
+
+    services["changed-configurations"] would be empty if the Stack Advisor call is made from Blueprints (1st invocation).
+    Subsequent Stack Advisor calls will have it non-empty. We do this because in subsequent invocations, even if Stack
+    Advisor calculates this value (configurations), it is finally not recommended, allowing the 'input' value to survive.
+    """
+    yarn_nm_mem_in_mb = None
+
+    yarn_site = getServicesSiteProperties(services, "yarn-site")
+    yarn_site_properties = getSiteProperties(configurations, "yarn-site")
+
+    # Check if services["changed-configurations"] is empty and 'yarn.nodemanager.resource.memory-mb' is modified in the current Stack Advisor invocation.
+    if not ("changed-configurations" in services and services["changed-configurations"]) and yarn_site_properties and 'yarn.nodemanager.resource.memory-mb' in yarn_site_properties:
+      yarn_nm_mem_in_mb = float(yarn_site_properties['yarn.nodemanager.resource.memory-mb'])
+    elif yarn_site and 'yarn.nodemanager.resource.memory-mb' in yarn_site:
+      # Check if 'yarn.nodemanager.resource.memory-mb' is input in services array.
+      yarn_nm_mem_in_mb = float(yarn_site['yarn.nodemanager.resource.memory-mb'])
+
+    if yarn_nm_mem_in_mb <= 0.0:
+      Logger.warning("'yarn.nodemanager.resource.memory-mb' current value : {0}. Expected value : > 0".format(yarn_nm_mem_in_mb))
+
+    return yarn_nm_mem_in_mb
+
   def getPreferredMountPoints(self, hostInfo):
 
     # '/etc/resolv.conf', '/etc/hostname', '/etc/hosts' are docker specific mount points
@@ -1438,10 +1467,37 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
 
   def validateYARNConfigurations(self, properties, recommendedDefaults, configurations, services, hosts):
     clusterEnv = getSiteProperties(configurations, "cluster-env")
-    validationItems = [ {"config-name": 'yarn.nodemanager.resource.memory-mb', "item": self.validatorLessThenDefaultValue(properties, recommendedDefaults, 'yarn.nodemanager.resource.memory-mb')},
+
+    validationItems = [ {"config-name": 'yarn.nodemanager.resource.memory-mb', "item": self.validatorGreaterThenDefaultValue(properties, recommendedDefaults, 'yarn.nodemanager.resource.memory-mb')},
                         {"config-name": 'yarn.scheduler.minimum-allocation-mb', "item": self.validatorLessThenDefaultValue(properties, recommendedDefaults, 'yarn.scheduler.minimum-allocation-mb')},
                         {"config-name": 'yarn.nodemanager.linux-container-executor.group', "item": self.validatorEqualsPropertyItem(properties, "yarn.nodemanager.linux-container-executor.group", clusterEnv, "user_group")},
-                        {"config-name": 'yarn.scheduler.maximum-allocation-mb', "item": self.validatorLessThenDefaultValue(properties, recommendedDefaults, 'yarn.scheduler.maximum-allocation-mb')} ]
+                        {"config-name": 'yarn.scheduler.maximum-allocation-mb', "item": self.validatorGreaterThenDefaultValue(properties, recommendedDefaults, 'yarn.scheduler.maximum-allocation-mb')} ]
+    nmMemory = int(self.get_yarn_nm_mem_in_mb(services, configurations))
+    if "items" in hosts and len(hosts["items"]) > 0:
+      nodeManagerHosts = self.getHostsWithComponent("YARN", "NODEMANAGER", services, hosts)
+      nmLowMemoryHosts = []
+      # NodeManager host with least memory is generally used in calculations as it will work in larger hosts.
+      if nodeManagerHosts is not None and len(nodeManagerHosts) > 0:
+        for nmHost in nodeManagerHosts:
+          nmHostName = nmHost["Hosts"]["host_name"]
+          componentNames = []
+          for service in services["services"]:
+            for component in service["components"]:
+              if not self.isClientComponent(component) and component["StackServiceComponents"]["hostnames"] is not None:
+                if nmHostName in component["StackServiceComponents"]["hostnames"]:
+                  componentNames.append(component["StackServiceComponents"]["component_name"])
+          requiredMemory = self.getMemorySizeRequired(services, componentNames, configurations)
+          unusedMemory = int((nmHost["Hosts"]["total_mem"] * 1024 - requiredMemory) / (1024 * 1024)) # in MB
+          if nmMemory > unusedMemory:
+            nmLowMemoryHosts.append(nmHostName)
+
+        if len(nmLowMemoryHosts) > 0:
+          validationItems.append({"config-name": "yarn.nodemanager.resource.memory-mb",
+                                  "item": self.getWarnItem(
+                                      "Node manager hosts with high memory usage found (examples : {0}). Consider reducing the allocated "
+                                      "memory for containers or moving other co-located components "
+                                      "to a different host.".format(",".join(nmLowMemoryHosts[:3])))})
+
     return self.toConfigurationValidationProblems(validationItems, "yarn-site")
 
   def validateYARNEnvConfigurations(self, properties, recommendedDefaults, configurations, services, hosts):
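
For intuition, the host check added to validateYARNConfigurations above reduces to unit conversion plus one comparison: total_mem arrives from the hosts API in KB, getMemorySizeRequired() returns bytes, and a host is flagged when the configured NodeManager allocation exceeds the memory left over after its co-located components are accounted for. A minimal standalone sketch with made-up numbers (the 5 GB of co-located component memory is an assumption for illustration, not a value from the patch):

    # Sketch of the per-host NodeManager headroom check (hypothetical inputs).
    total_mem_kb = 12582912                 # 12 GB host, matching the unit test below
    required_memory_bytes = 5 * 1024 ** 3   # assumed memory needed by co-located components
    nm_memory_mb = 12288                    # yarn.nodemanager.resource.memory-mb

    # Same formula as the patch: host memory in bytes, minus what the other
    # components need, expressed in MB.
    unused_memory_mb = int((total_mem_kb * 1024 - required_memory_bytes) / (1024 * 1024))

    if nm_memory_mb > unused_memory_mb:     # 12288 > 7168, so this host is flagged
      print("flagged: %d MB allocated vs %d MB unused" % (nm_memory_mb, unused_memory_mb))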

http://git-wip-us.apache.org/repos/asf/ambari/blob/6a811557/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
index d2c0459..17f0c59 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
@@ -109,7 +109,7 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
       "ATLAS": {"application-properties": self.validateAtlasConfigurations},
       "HIVE": {"hive-interactive-env": self.validateHiveInteractiveEnvConfigurations,
                "hive-interactive-site": self.validateHiveInteractiveSiteConfigurations},
-      "YARN": {"yarn-site": self.validateYarnConfigurations},
+      "YARN": {"yarn-site": self.validateYARNConfigurations},
       "RANGER": {"ranger-tagsync-site": self.validateRangerTagsyncConfigurations},
       "SPARK2": {"spark2-defaults": self.validateSpark2Defaults,
                  "spark2-thrift-sparkconf": self.validateSpark2ThriftSparkConf},
@@ -247,7 +247,7 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
     ]
     return self.toConfigurationValidationProblems(validationItems, "spark2-thrift-sparkconf")
 
-  def validateYarnConfigurations(self, properties, recommendedDefaults, configurations, services, hosts):
+  def validateYARNConfigurations(self, properties, recommendedDefaults, configurations, services, hosts):
     parentValidationProblems = super(HDP25StackAdvisor, self).validateYARNConfigurations(properties, recommendedDefaults, configurations, services, hosts)
     yarn_site_properties = self.getSiteProperties(configurations, "yarn-site")
     servicesList = [service["StackServices"]["service_name"] for service in services["services"]]
@@ -1361,35 +1361,6 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
     if yarn_min_container_size < 256:
       return 256
 
-  def get_yarn_nm_mem_in_mb(self, services, configurations):
-    """
-    Gets YARN NodeManager memory in MB (yarn.nodemanager.resource.memory-mb).
-    Reads from:
-    - configurations (output), if the value was changed as part of the current Stack Advisor invocation and
-      services["changed-configurations"] is empty, else
-    - services['configurations'] (input).
-
-    services["changed-configurations"] would be empty if the Stack Advisor call is made from Blueprints (1st invocation).
-    Subsequent Stack Advisor calls will have it non-empty. We do this because in subsequent invocations, even if Stack
-    Advisor calculates this value (configurations), it is finally not recommended, allowing the 'input' value to survive.
-    """
-    yarn_nm_mem_in_mb = None
-
-    yarn_site = self.getServicesSiteProperties(services, "yarn-site")
-    yarn_site_properties = self.getSiteProperties(configurations, "yarn-site")
-
-    # Check if services["changed-configurations"] is empty and 'yarn.nodemanager.resource.memory-mb' is modified in the current Stack Advisor invocation.
-    if not services["changed-configurations"] and yarn_site_properties and 'yarn.nodemanager.resource.memory-mb' in yarn_site_properties:
-      yarn_nm_mem_in_mb = float(yarn_site_properties['yarn.nodemanager.resource.memory-mb'])
-    elif yarn_site and 'yarn.nodemanager.resource.memory-mb' in yarn_site:
-      # Check if 'yarn.nodemanager.resource.memory-mb' is input in services array.
-      yarn_nm_mem_in_mb = float(yarn_site['yarn.nodemanager.resource.memory-mb'])
-
-    if yarn_nm_mem_in_mb <= 0.0:
-      Logger.warning("'yarn.nodemanager.resource.memory-mb' current value : {0}. Expected value : > 0".format(yarn_nm_mem_in_mb))
-
-    return yarn_nm_mem_in_mb
-
   def calculate_tez_am_container_size(self, services, total_cluster_capacity):
     """
     Calculates Tez App Master container size (tez.am.resource.memory.mb) for tez_hive2/tez-site on initialization if the value read is 0.
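
One reason the rename above matters: Python method lookup is case-sensitive, so the old validateYarnConfigurations was a separate attribute that never overrode the parent's validateYARNConfigurations; after the rename, the HDP 2.5 logic is reachable through the normal override and super() chain as well as through the validator registry. A toy illustration of that mechanism (Parent and Child are hypothetical stand-ins, not Ambari classes):

    class Parent(object):
      def validateYARNConfigurations(self):
        return ["parent checks"]

    class Child(Parent):
      # Spelled exactly like the parent's method, so this overrides it; a method
      # named validateYarnConfigurations would be an unrelated attribute instead.
      def validateYARNConfigurations(self):
        return super(Child, self).validateYARNConfigurations() + ["child checks"]

    print(Child().validateYARNConfigurations())  # ['parent checks', 'child checks']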

http://git-wip-us.apache.org/repos/asf/ambari/blob/6a811557/ambari-server/src/main/resources/stacks/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/stack_advisor.py b/ambari-server/src/main/resources/stacks/stack_advisor.py
index ad3b510..6fb014e 100644
--- a/ambari-server/src/main/resources/stacks/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/stack_advisor.py
@@ -2151,6 +2151,24 @@ class DefaultStackAdvisor(StackAdvisor):
       return self.getWarnItem("Value is less than the recommended default of {0}".format(defaultValue))
     return None
 
+  def validatorGreaterThenDefaultValue(self, properties, recommendedDefaults, propertyName):
+    if propertyName not in recommendedDefaults:
+      # If a property name exists in say hbase-env and hbase-site (which is allowed), then it will exist in the
+      # "properties" dictionary, but not necessarily in the "recommendedDefaults" dictionary. In this case, ignore it.
+      return None
+
+    if not propertyName in properties:
+      return self.getErrorItem("Value should be set")
+    value = self.to_number(properties[propertyName])
+    if value is None:
+      return self.getErrorItem("Value should be integer")
+    defaultValue = self.to_number(recommendedDefaults[propertyName])
+    if defaultValue is None:
+      return None
+    if value > defaultValue:
+      return self.getWarnItem("Value is greater than the recommended default of {0}".format(defaultValue))
+    return None
+
   def validatorEqualsPropertyItem(self, properties1, propertyName1, properties2, propertyName2, emptyAllowed=False):


http://git-wip-us.apache.org/repos/asf/ambari/blob/6a811557/ambari-server/src/test/python/stacks/2.0.6/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.0.6/common/test_stack_advisor.py
index ff25512..a6931c5 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/common/test_stack_advisor.py
@@ -776,7 +776,7 @@ class TestHDP206StackAdvisor(TestCase):
 
     # Test - Cluster data with 2 hosts - pick minimum memory
     servicesList.append("YARN")
-    services = services = {"services":
+    services = {"services":
                 [{"StackServices":
                       {"service_name" : "YARN",
                        "service_version" : "2.6.0.2.2"
@@ -3401,7 +3401,41 @@ class TestHDP206StackAdvisor(TestCase):
         }
       }
     }
-    services = {'configurations': {} }
+
+    services = {"services":
+                [{"StackServices":
+                      {"service_name" : "YARN",
+                       "service_version" : "2.6.0.2.2"
+                      },
+                  "components":[
+                    {
+                      "StackServiceComponents":{
+                        "advertise_version":"true",
+                        "cardinality":"1+",
+                        "component_category":"SLAVE",
+                        "component_name":"NODEMANAGER",
+                        "custom_commands":[
+
+                        ],
+                        "display_name":"NodeManager",
+                        "is_client":"false",
+                        "is_master":"false",
+                        "service_name":"YARN",
+                        "stack_name":"HDP",
+                        "stack_version":"2.2",
+                        "hostnames":[
+                          "host1",
+                          "host2"
+                        ]
+                      },
+                      "dependencies":[
+                      ]
+                    }
+                  ],
+                }],
+                "configurations": {}
+    }
+
     recommendedDefaults = {'yarn.nodemanager.resource.memory-mb' : '12288',
                            'yarn.scheduler.minimum-allocation-mb' : '3072',
                            'yarn.nodemanager.linux-container-executor.group': 'hadoop',
@@ -3413,3 +3447,78 @@ class TestHDP206StackAdvisor(TestCase):
 
     res = self.stackAdvisor.validateYARNConfigurations(properties, recommendedDefaults, configurations, services, {})
     self.assertFalse(res)
+
+    hosts = {
+      "items" : [
+        {
+          "Hosts" : {
+            "host_name" : "host1",
+            "cpu_count" : 2,
+            "total_mem" : 12582912,
+            "disk_info" : [
+              {
+                "available" : "21052800",
+                "device" : "/dev/vda1",
+                "used" : "3303636",
+                "percent" : "14%",
+                "size" : "25666616",
+                "type" : "ext4",
+                "mountpoint" : "/"
+              },
+              {
+                "available" : "244732200",
+                "device" : "/dev/vdb",
+                "used" : "60508",
+                "percent" : "1%",
+                "size" : "257899908",
+                "type" : "ext4",
+                "mountpoint" : "/grid/0"
+              }
+            ]
+          }
+        }
+      ]
+    }
+    # Cluster RAM = 12 GB (12582912 KB)
+    # YARN NodeManager HeapSize = 1024 MB (default)
+    # Max Container Allocation = 11264 MB (user set to 12288)
+    expectedItems = [
+      {
+        'config-type': 'yarn-site',
+        'message': 'Node manager hosts with high memory usage found (examples : host1). '
+                   'Consider reducing the allocated memory for containers or '
+                   'moving other co-located components to a different host.',
+        'type': 'configuration',
+        'config-name': 'yarn.nodemanager.resource.memory-mb',
+        'level': 'WARN'
+      }
+    ]
+    items = self.stackAdvisor.validateYARNConfigurations(properties, recommendedDefaults, configurations, services, hosts)
+    self.assertEquals(expectedItems, items)
+
+
+    recommendedDefaults = {'yarn.nodemanager.resource.memory-mb' : '10240',
+                           'yarn.scheduler.minimum-allocation-mb' : '3072',
+                           'yarn.nodemanager.linux-container-executor.group': 'hadoop',
+                           'yarn.scheduler.maximum-allocation-mb': '10240'}
+
+    expectedItems = [
+      {
+        'config-type': 'yarn-site',
+        'message': 'Value is greater than the recommended default of 10240',
+        'type': 'configuration',
+        'config-name': 'yarn.nodemanager.resource.memory-mb',
+        'level': 'WARN'
+      },
+      {
+        'config-type': 'yarn-site',
+        'message': 'Value is greater than the recommended default of 10240',
+        'type': 'configuration',
+        'config-name': 'yarn.scheduler.maximum-allocation-mb',
+        'level': 'WARN'
+      }
+    ]
+
+    items = self.stackAdvisor.validateYARNConfigurations(properties, recommendedDefaults, configurations, services, {})
+    self.assertEquals(expectedItems, items)


http://git-wip-us.apache.org/repos/asf/ambari/blob/6a811557/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
index ad962fd..a53cb25 100644
--- a/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
@@ -444,7 +444,7 @@ class TestHDP25StackAdvisor(TestCase):
 
     self.assertEquals(validations[0], expected)
 
-  def test_validateYarnConfigurations(self):
+  def test_validateYARNConfigurations(self):
     properties = {'enable_hive_interactive': 'true',
                   'hive.tez.container.size': '2048',
                   "yarn.nodemanager.linux-container-executor.group": "hadoop"}
     recommendedDefaults = {'enable_hive_interactive': 'true',
@@ -473,7 +473,7 @@ class TestHDP25StackAdvisor(TestCase):
     res_expected = [
       {'config-type': 'yarn-site', 'message': 'While enabling HIVE_SERVER_INTERACTIVE it is recommended that you enable work preserving restart in YARN.', 'type': 'configuration', 'config-name': 'yarn.resourcemanager.work-preserving-recovery.enabled', 'level': 'WARN'}
     ]
-    res = self.stackAdvisor.validateYarnConfigurations(properties, recommendedDefaults, configurations, services, {})
+    res = self.stackAdvisor.validateYARNConfigurations(properties, recommendedDefaults, configurations, services, {})
     self.assertEquals(res, res_expected)
     pass
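
Taken together with the tests, the contract of the new validatorGreaterThenDefaultValue can be restated as a small decision table. The sketch below is a simplified paraphrase, not the Ambari method: the real code returns getErrorItem/getWarnItem result objects and uses self.to_number, which are replaced here by strings and a local helper.

    def to_number(s):
      # Local stand-in for DefaultStackAdvisor.to_number.
      try:
        return int(s)
      except ValueError:
        return None

    def greater_than_default(properties, recommended_defaults, name):
      if name not in recommended_defaults:
        return None                            # no recommended default -> nothing to compare
      if name not in properties:
        return "ERROR: Value should be set"
      value = to_number(properties[name])
      if value is None:
        return "ERROR: Value should be integer"
      default = to_number(recommended_defaults[name])
      if default is None:
        return None
      if value > default:
        return "WARN: Value is greater than the recommended default of %d" % default
      return None

    # Mirrors the second 2.0.6 test case: 12288 configured vs 10240 recommended.
    print(greater_than_default({'yarn.nodemanager.resource.memory-mb': '12288'},
                               {'yarn.nodemanager.resource.memory-mb': '10240'},
                               'yarn.nodemanager.resource.memory-mb'))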