Repository: ambari Updated Branches: refs/heads/branch-2.4 9f124b8a4 -> 5fc37db51
AMBARI-16646: Set vm.overcommit_memory dynamically for HAWQ (mithmatt) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/5fc37db5 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/5fc37db5 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/5fc37db5 Branch: refs/heads/branch-2.4 Commit: 5fc37db5142bab3ff8298be0fe4653076362c0b7 Parents: 9f124b8 Author: Matt <mmat...@pivotal.io> Authored: Fri May 13 15:09:52 2016 -0700 Committer: Matt <mmat...@pivotal.io> Committed: Fri May 13 15:09:52 2016 -0700 ---------------------------------------------------------------------- .../HAWQ/2.0.0/service_advisor.py | 28 +++-- .../main/resources/stacks/service_advisor.py | 3 +- .../stacks/2.3/HAWQ/test_service_advisor.py | 123 +++++++++++++++++++ .../stacks/2.3/common/test_stack_advisor.py | 39 +++++- 4 files changed, 180 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/5fc37db5/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/service_advisor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/service_advisor.py b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/service_advisor.py index a26a398..276dd3a 100644 --- a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/service_advisor.py +++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/service_advisor.py @@ -120,11 +120,15 @@ class HAWQ200ServiceAdvisor(service_advisor.ServiceAdvisor): # Set dfs.allow.truncate to true putHdfsSiteProperty('dfs.allow.truncate', 'true') - if any(x in services["configurations"] for x in ["hawq-site", "hdfs-client"]): + if any(x in services["configurations"] for x in ["hawq-site", "hdfs-client", "hawq-sysctl-env"]): componentsListList = [service["components"] for service in 
services["services"]] componentsList = [item["StackServiceComponents"] for sublist in componentsListList for item in sublist] servicesList = [service["StackServices"]["service_name"] for service in services["services"]] - numSegments = len(self.getHosts(componentsList, "HAWQSEGMENT")) + hawqMasterHosts = set(self.getHosts(componentsList, "HAWQMASTER")).union(set(self.getHosts(componentsList, "HAWQSTANDBY"))) + hawqSegmentHosts = set(self.getHosts(componentsList, "HAWQSEGMENT")) + hawqHosts = hawqMasterHosts.union(hawqSegmentHosts) + numSegments = len(hawqSegmentHosts) + minHawqHostsMemory = min([host['Hosts']['total_mem'] for host in hosts['items'] if host['Hosts']['host_name'] in hawqHosts]) if "hawq-site" in services["configurations"]: hawq_site = services["configurations"]["hawq-site"]["properties"] @@ -157,11 +161,21 @@ class HAWQ200ServiceAdvisor(service_advisor.ServiceAdvisor): if hs_prop in hawq_site and ys_prop in yarn_site: putHawqSiteProperty(hs_prop, yarn_site[ys_prop]) + # set vm.overcommit_memory to 2 if the minimum memory among all hawqHosts is at least 32GB + if "hawq-sysctl-env" in services["configurations"]: + MEM_THRESHOLD = 33554432 # 32GB, minHawqHostsMemory is represented in kB + hawq_sysctl_env = services["configurations"]["hawq-sysctl-env"]["properties"] + if "vm.overcommit_memory" in hawq_sysctl_env: + propertyValue = "2" if minHawqHostsMemory >= MEM_THRESHOLD else "1" + putHawqSysctlEnvProperty = self.putProperty(configurations, "hawq-sysctl-env", services) + putHawqSysctlEnvProperty("vm.overcommit_memory", propertyValue) + # set output.replace-datanode-on-failure in HAWQ hdfs-client depending on the cluster size if "hdfs-client" in services["configurations"]: + MIN_NUM_SEGMENT_THRESHOLD = 3 hdfs_client = services["configurations"]["hdfs-client"]["properties"] if "output.replace-datanode-on-failure" in hdfs_client: - propertyValue = "true" if numSegments > 3 else "false" + propertyValue = "true" if numSegments > 
MIN_NUM_SEGMENT_THRESHOLD else "false" putHdfsClientProperty = self.putProperty(configurations, "hdfs-client", services) putHdfsClientProperty("output.replace-datanode-on-failure", propertyValue) @@ -272,13 +286,13 @@ class HAWQ200ServiceAdvisor(service_advisor.ServiceAdvisor): numSegments = len(self.getHosts(componentsList, "HAWQSEGMENT")) message = None - limit = 3 - if numSegments > limit and value != 'TRUE': + MIN_NUM_SEGMENT_THRESHOLD = 3 + if numSegments > MIN_NUM_SEGMENT_THRESHOLD and value != 'TRUE': message = "{0} should be set to true (checked) for clusters with more than {1} HAWQ Segments" - elif numSegments <= limit and value != 'FALSE': + elif numSegments <= MIN_NUM_SEGMENT_THRESHOLD and value != 'FALSE': message = "{0} should be set to false (unchecked) for clusters with {1} or less HAWQ Segments" if message: - validationItems.append({"config-name": PROP_NAME, "item": self.getWarnItem(message.format(PROP_NAME, str(limit)))}) + validationItems.append({"config-name": PROP_NAME, "item": self.getWarnItem(message.format(PROP_NAME, str(MIN_NUM_SEGMENT_THRESHOLD)))}) return stackAdvisor.toConfigurationValidationProblems(validationItems, "hdfs-client") http://git-wip-us.apache.org/repos/asf/ambari/blob/5fc37db5/ambari-server/src/main/resources/stacks/service_advisor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/service_advisor.py b/ambari-server/src/main/resources/stacks/service_advisor.py index 86682c5..3d6c293 100644 --- a/ambari-server/src/main/resources/stacks/service_advisor.py +++ b/ambari-server/src/main/resources/stacks/service_advisor.py @@ -172,7 +172,8 @@ class ServiceAdvisor(object): Returns the hosts which are running the given component. 
""" def getHosts(self, componentsList, componentName): - return [component["hostnames"] for component in componentsList if component["component_name"] == componentName][0] + hostNamesList = [component["hostnames"] for component in componentsList if component["component_name"] == componentName] + return hostNamesList[0] if len(hostNamesList) > 0 else [] """ Utility method for setting a configuration property value. http://git-wip-us.apache.org/repos/asf/ambari/blob/5fc37db5/ambari-server/src/test/python/stacks/2.3/HAWQ/test_service_advisor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.3/HAWQ/test_service_advisor.py b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_service_advisor.py new file mode 100644 index 0000000..50f3a1f --- /dev/null +++ b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_service_advisor.py @@ -0,0 +1,123 @@ +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import os +from unittest import TestCase + + +class TestHAWQ200ServiceAdvisor(TestCase): + + def setUp(self): + import imp + self.testDirectory = os.path.dirname(os.path.abspath(__file__)) + stackAdvisorPath = os.path.join(self.testDirectory, '../../../../../main/resources/stacks/stack_advisor.py') + hawq200ServiceAdvisorPath = os.path.join(self.testDirectory, '../../../../../main/resources/common-services/HAWQ/2.0.0/service_advisor.py') + + with open(stackAdvisorPath, 'rb') as fp: + stack_advisor = imp.load_module('stack_advisor', fp, stackAdvisorPath, ('.py', 'rb', imp.PY_SOURCE)) + with open(hawq200ServiceAdvisorPath, 'rb') as fp: + service_advisor = imp.load_module('stack_advisor_impl', fp, hawq200ServiceAdvisorPath, ('.py', 'rb', imp.PY_SOURCE)) + + stackAdvisorClass = getattr(stack_advisor, 'StackAdvisor') + self.stackAdvisor = stackAdvisorClass() + + serviceAdvisorClass = getattr(service_advisor, 'HAWQ200ServiceAdvisor') + self.serviceAdvisor = serviceAdvisorClass() + + def test_getServiceConfigurationRecommendations(self): + + configurations = { + "hawq-sysctl-env": { + "properties": { + "vm.overcommit_memory": "1" + } + } + } + + services = { + "services": [ + { + "StackServices": { + "service_name": "HAWQ", + "service_version": "2.0", + "stack_name": "HDP", + "stack_version": "2.3" + }, + "components": [ + { + "StackServiceComponents": { + "component_name": "HAWQMASTER", + "hostnames": [ + "c6401.ambari.apache.org" + ] + } + }, + { + "StackServiceComponents": { + "component_name": "HAWQSEGMENT", + "hostnames": [ + "c6402.ambari.apache.org", + "c6404.ambari.apache.org", + ] + } + } + ] + } + ], + "configurations": configurations + } + + hosts = { + "items": [ + { + "Hosts": { + "host_name": "c6401.ambari.apache.org", + "total_mem": 33554432 + } + }, + { + "Hosts": { + "host_name": "c6402.ambari.apache.org", + "total_mem": 33554433 + } + }, + { + "Hosts": { + "host_name": "c6403.ambari.apache.org", + "total_mem": 33554434 + } + }, + { + "Hosts": { + 
"host_name": "c6404.ambari.apache.org", + "total_mem": 33554435 + } + } + ] + } + + ## Test if vm.overcommit_memory is set correctly + + # Case 1: All machines have total_mem above 32GB (total_mem >= 33554432) + self.serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, None, services, hosts) + self.assertEquals(configurations["hawq-sysctl-env"]["properties"]["vm.overcommit_memory"], "2") + + # Case 2: One machine has total_mem below 32GB + hosts["items"][0]["Hosts"]["total_mem"] = 33554431 + self.serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, None, services, hosts) + self.assertEquals(configurations["hawq-sysctl-env"]["properties"]["vm.overcommit_memory"], "1") \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/5fc37db5/ambari-server/src/test/python/stacks/2.3/common/test_stack_advisor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.3/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.3/common/test_stack_advisor.py index be0f3e4..6981205 100644 --- a/ambari-server/src/test/python/stacks/2.3/common/test_stack_advisor.py +++ b/ambari-server/src/test/python/stacks/2.3/common/test_stack_advisor.py @@ -1989,6 +1989,35 @@ class TestHDP23StackAdvisor(TestCase): def test_recommendHAWQConfigurations(self): + hosts = { + "items": [ + { + "Hosts": { + "host_name": "c6401.ambari.apache.org", + "total_mem": 12345678 + } + }, + { + "Hosts": { + "host_name": "c6402.ambari.apache.org", + "total_mem": 12345678 + } + }, + { + "Hosts": { + "host_name": "c6403.ambari.apache.org", + "total_mem": 12345678 + } + }, + { + "Hosts": { + "host_name": "c6404.ambari.apache.org", + "total_mem": 12345678 + } + } + ] + } + # original cluster data with 3 segments services = self.load_json("services-normal-hawq-3-hosts.json") componentsListList = [service["components"] for service in 
services["services"]] @@ -2012,7 +2041,7 @@ class TestHDP23StackAdvisor(TestCase): # Test 1 - with 3 segments self.assertEquals(len(hawqSegmentComponent["hostnames"]), 3) serviceAdvisor = self.createHAWQServiceAdvisor() - serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, None) + serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, hosts) self.assertEquals(configurations["hawq-site"]["properties"]["default_hash_table_bucket_number"], str(3 * 6)) self.assertEquals(configurations["hdfs-client"]["properties"]["output.replace-datanode-on-failure"], "false") @@ -2022,19 +2051,19 @@ class TestHDP23StackAdvisor(TestCase): # Test 2 - with 100 segments hawqSegmentComponent["hostnames"] = ["host" + str(i) for i in range(100)] - serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, None) + serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, hosts) self.assertEquals(configurations["hawq-site"]["properties"]["default_hash_table_bucket_number"], str(100 * 5)) self.assertEquals(configurations["hdfs-client"]["properties"]["output.replace-datanode-on-failure"], "true") # Test 3 - with 512 segments hawqSegmentComponent["hostnames"] = ["host" + str(i) for i in range(512)] - serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, None) + serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, hosts) self.assertEquals(configurations["hawq-site"]["properties"]["default_hash_table_bucket_number"], "512") self.assertEquals(configurations["hdfs-client"]["properties"]["output.replace-datanode-on-failure"], "true") # Test 4 - with 513 segments hawqSegmentComponent["hostnames"] = ["host" + str(i) for i in range(513)] - 
serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, None) + serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, hosts) self.assertEquals(configurations["hawq-site"]["properties"]["default_hash_table_bucket_number"], "512") self.assertEquals(configurations["hdfs-client"]["properties"]["output.replace-datanode-on-failure"], "true") @@ -2042,7 +2071,7 @@ class TestHDP23StackAdvisor(TestCase): configurations = {} services["configurations"]["hawq-site"] = {"properties":{'hawq-site': {'properties': {}}}} hawqSegmentComponent["hostnames"] = [] - serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, None) + serviceAdvisor.getServiceConfigurationRecommendations(self.stackAdvisor, configurations, clusterData, services, hosts) self.assertEquals(configurations, {'hdfs-client': {'properties': {'output.replace-datanode-on-failure': 'false'}}, 'hawq-site': {'properties': {}}, 'hdfs-site': {'properties': {'dfs.allow.truncate': 'true'}}})