Repository: ambari

Updated Branches:
  refs/heads/branch-2.5 da2aa3e6e -> 3e2539d3e
AMBARI-20895. Fixing sizing for Hive-interactive-site's Tez AMs (sseth via Swapan Shridhar).

Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/3e2539d3
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/3e2539d3
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/3e2539d3

Branch: refs/heads/branch-2.5
Commit: 3e2539d3e45c3eedd817f051881a39da724428b0
Parents: da2aa3e
Author: Swapan Shridhar <sshrid...@hortonworks.com>
Authored: Fri Apr 28 14:58:52 2017 -0700
Committer: Swapan Shridhar <sshrid...@hortonworks.com>
Committed: Fri Apr 28 16:32:05 2017 -0700

----------------------------------------------------------------------
 .../stacks/HDP/2.5/services/stack_advisor.py |  29 ++
 .../stacks/2.5/common/test_stack_advisor.py  | 408 +++++++++++++++++++
 2 files changed, 437 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/ambari/blob/3e2539d3/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
index b6aca4c..638e79a 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.5/services/stack_advisor.py
@@ -1152,7 +1152,9 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
     Logger.info("DBG: Calculated 'llap_mem_daemon_size' : {0}, using following : llap_mem_for_tezAm_and_daemons : {1}, tez_am_memory_required : "
                 "{2}".format(llap_mem_daemon_size, llap_mem_for_tezAm_and_daemons, tez_am_memory_required))
+
     llap_daemon_mem_per_node = self._normalizeDown(llap_mem_daemon_size / num_llap_nodes_requested, yarn_min_container_size)
+    # This value takes into account total cluster capacity, and may not have left enough capacity on each node to launch an AM.
     Logger.info("DBG: Calculated 'llap_daemon_mem_per_node' : {0}, using following : llap_mem_daemon_size : {1}, num_llap_nodes_requested : {2}, "
                 "yarn_min_container_size: {3}".format(llap_daemon_mem_per_node, llap_mem_daemon_size, num_llap_nodes_requested, yarn_min_container_size))
     if llap_daemon_mem_per_node == 0:
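A note on the rounding used above: the patch relies on _normalizeDown to floor a value to a multiple of the YARN minimum container size. A minimal standalone sketch of that behavior, assuming the helper matches its usage in this hunk (its definition is not shown in this diff), with purely hypothetical input numbers:

import math

def normalize_down(value, unit):
  # Floor 'value' to the nearest multiple of 'unit'; 0 if value is below one unit.
  multiples = math.floor(float(value) / unit)
  if multiples < 1:
    return 0
  return int(multiples * unit)

# Hypothetical figures: a 100000 MB daemon share split across 3 LLAP nodes,
# with yarn.scheduler.minimum-allocation-mb = 1024.
print(normalize_down(100000 / 3, 1024))  # -> 32768

So a per-node share that does not divide evenly is rounded down to a schedulable container size, never up.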
@@ -1172,6 +1174,31 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
       num_llap_nodes = num_llap_nodes_requested
     Logger.info("DBG: num_llap_nodes : {0}".format(num_llap_nodes))
+    # Make sure we have enough memory on each node to run AMs.
+    # If nodes vs nodes_requested is different - AM memory is already factored in.
+    # If llap_node_count < total_cluster_nodes - assuming AMs can run on a different node.
+    # Else factor in min_concurrency_per_node * tez_am_size, and slider_am_size.
+    # Also needs to factor in whether num_llap_nodes = cluster_node_count.
+    min_mem_reserved_per_node = 0
+    if num_llap_nodes == num_llap_nodes_requested and num_llap_nodes == node_manager_cnt:
+      min_mem_reserved_per_node = max(normalized_tez_am_container_size, slider_am_container_size)
+      tez_AMs_per_node = llap_concurrency / num_llap_nodes
+      tez_AMs_per_node_low = int(math.floor(tez_AMs_per_node))
+      tez_AMs_per_node_high = int(math.ceil(tez_AMs_per_node))
+      min_mem_reserved_per_node = int(max(tez_AMs_per_node_high * normalized_tez_am_container_size, tez_AMs_per_node_low * normalized_tez_am_container_size + slider_am_container_size))
+      Logger.info("DBG: Determined 'AM reservation per node': {0}, using following : concurrency: {1}, num_llap_nodes: {2}, AMsPerNode: {3}"
+                  .format(min_mem_reserved_per_node, llap_concurrency, num_llap_nodes, tez_AMs_per_node))
+
+    max_single_node_mem_available_for_daemon = self._normalizeDown(yarn_nm_mem_in_mb_normalized - min_mem_reserved_per_node, yarn_min_container_size)
+    if max_single_node_mem_available_for_daemon <= 0 or max_single_node_mem_available_for_daemon < mem_per_thread_for_llap:
+      Logger.warning("Not enough capacity available per node for daemons after factoring in AM memory requirements. NM Mem: {0}, "
+                     "minAMMemPerNode: {1}, available: {2}".format(yarn_nm_mem_in_mb_normalized, min_mem_reserved_per_node, max_single_node_mem_available_for_daemon))
+      self.recommendDefaultLlapConfiguration(configurations, services, hosts)
+
+    llap_daemon_mem_per_node = min(max_single_node_mem_available_for_daemon, llap_daemon_mem_per_node)
+    Logger.info("DBG: Determined final memPerDaemon: {0}, using following: concurrency: {1}, numNMNodes: {2}, numLlapNodes: {3} "
+                .format(llap_daemon_mem_per_node, llap_concurrency, node_manager_cnt, num_llap_nodes))
+
     num_executors_per_node_max = self.get_max_executors_per_node(yarn_nm_mem_in_mb_normalized, cpu_per_nm_host, mem_per_thread_for_llap)
     if num_executors_per_node_max < 1:
       Logger.warning("Calculated 'Max. Executors per Node' = {0}. Expected values >= 1.".format(num_executors_per_node_max))
@@ -1192,6 +1219,8 @@ class HDP25StackAdvisor(HDP24StackAdvisor):
     # Now figure out how much of the memory will be used by the executors, and how much will be used by the cache.
     total_mem_for_executors_per_node = num_executors_per_node * mem_per_thread_for_llap
     cache_mem_per_node = llap_daemon_mem_per_node - total_mem_for_executors_per_node
+    Logger.info("DBG: Calculated 'Cache per node' : {0}, using following : llap_daemon_mem_per_node : {1}, total_mem_for_executors_per_node : {2}"
+                .format(cache_mem_per_node, llap_daemon_mem_per_node, total_mem_for_executors_per_node))
 
     tez_runtime_io_sort_mb = (long((0.8 * mem_per_thread_for_llap) / 3))
     tez_runtime_unordered_output_buffer_size = long(0.8 * 0.075 * mem_per_thread_for_llap)
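The effect of the new reservation is easiest to see with concrete numbers. The standalone sketch below (not part of the patch) mirrors the min_mem_reserved_per_node arithmetic, using the figures from the nine-node tests added further down: 212992 MB of NodeManager memory, a 4096 MB normalized Tez AM container, a 1024 MB Slider AM container, and LLAP daemons on all 9 nodes. Float division is assumed for the AMs-per-node ratio, since the expected results below require a fractional value:

import math

def am_reservation_per_node(concurrency, num_llap_nodes, tez_am_mb, slider_am_mb):
  # Reserve room for the Tez AMs expected to land on each node, or for one
  # fewer Tez AM plus the Slider AM, whichever is larger.
  ams_per_node = float(concurrency) / num_llap_nodes
  low = int(math.floor(ams_per_node))
  high = int(math.ceil(ams_per_node))
  return int(max(high * tez_am_mb, low * tez_am_mb + slider_am_mb))

nm_mem_mb = 212992  # already a multiple of the 1024 MB minimum container size
for concurrency in (4, 9, 10):
  reserved = am_reservation_per_node(concurrency, 9, 4096, 1024)
  print(concurrency, reserved, nm_mem_mb - reserved)
# concurrency 4  -> reserve 4096 MB, leaving 208896 MB per node for the daemon
# concurrency 9  -> reserve 5120 MB, leaving 207872 MB
# concurrency 10 -> reserve 8192 MB, leaving 204800 MB

Those three leftovers are exactly the hive.llap.daemon.yarn.container.mb values asserted in Tests A, B, and C below.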
http://git-wip-us.apache.org/repos/asf/ambari/blob/3e2539d3/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
index c9106df..7bc9272 100644
--- a/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.5/common/test_stack_advisor.py
@@ -140,6 +140,149 @@ class TestHDP25StackAdvisor(TestCase):
       ]
     }
+
+    # setup for 'test_recommendYARNConfigurations'
+    self.hosts_9_total = {
+      "items": [
+        {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6401.ambari.apache.org",
+            "host_name": "c6401.ambari.apache.org"
+          },
+        }, {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6402.ambari.apache.org",
+            "host_name": "c6402.ambari.apache.org"
+          },
+        }, {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6403.ambari.apache.org",
+            "host_name": "c6403.ambari.apache.org"
+          },
+        }, {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6404.ambari.apache.org",
+            "host_name": "c6404.ambari.apache.org"
+          },
+        }, {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6405.ambari.apache.org",
+            "host_name": "c6405.ambari.apache.org"
+          },
+        }, {
+          "Hosts": {
+            "cpu_count": 6,
+            "total_mem": 50331648,
+            "disk_info": [
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"},
+              {"mountpoint": "/"},
+              {"mountpoint": "/dev/shm"},
+              {"mountpoint": "/vagrant"}
+            ],
+            "public_host_name": "c6406.ambari.apache.org",
+            "host_name": "c6406.ambari.apache.org"
+          },
+        }, {
"/vagrant"} + ], + "public_host_name": "c6407.ambari.apache.org", + "host_name": "c6407.ambari.apache.org" + }, + }, { + "Hosts": { + "cpu_count": 6, + "total_mem": 50331648, + "disk_info": [ + {"mountpoint": "/"}, + {"mountpoint": "/dev/shm"}, + {"mountpoint": "/vagrant"}, + {"mountpoint": "/"}, + {"mountpoint": "/dev/shm"}, + {"mountpoint": "/vagrant"} + ], + "public_host_name": "c6408.ambari.apache.org", + "host_name": "c6408.ambari.apache.org" + }, + }, { + "Hosts": { + "cpu_count": 6, + "total_mem": 50331648, + "disk_info": [ + {"mountpoint": "/"}, + {"mountpoint": "/dev/shm"}, + {"mountpoint": "/vagrant"}, + {"mountpoint": "/"}, + {"mountpoint": "/dev/shm"}, + {"mountpoint": "/vagrant"} + ], + "public_host_name": "c6409.ambari.apache.org", + "host_name": "c6409.ambari.apache.org" + }, + } + ] + } + # Expected config outputs. # Expected capacity-scheduler with 'llap' (size:20) and 'default' queue at root level. @@ -3673,6 +3816,271 @@ class TestHDP25StackAdvisor(TestCase): + ####################### 'Nine Node Managers' cluster - tests for calculating llap configs ################ + + + + # Test 16 (1). 'default' and 'llap' (State : RUNNING) queue exists at root level in capacity-scheduler, and + # 'capacity-scheduler' configs are passed-in as dictionary and + # services['configurations']["capacity-scheduler"]["properties"]["capacity-scheduler"] is set to value "null" and + # (2). enable_hive_interactive' is 'on' and (3). configuration change detected for 'hive.server2.tez.sessions.per.default.queue' + # Expected : Configurations values recommended for llap related configs. + def test_recommendYARNConfigurations_nine_node_manager_llap_configs_updated_1(self): + # 9 node managers and yarn.nodemanager.resource.memory-mb": "204800" + services = { + "services": [{ + "StackServices": { + "service_name": "YARN", + }, + "Versions": { + "stack_version": "2.5" + }, + "components": [ + { + "StackServiceComponents": { + "component_name": "NODEMANAGER", + "hostnames": ["c6401.ambari.apache.org", "c6402.ambari.apache.org", "c6403.ambari.apache.org", + "c6404.ambari.apache.org", "c6405.ambari.apache.org", "c6406.ambari.apache.org", + "c6407.ambari.apache.org", "c6408.ambari.apache.org", "c6409.ambari.apache.org"] + } + } + ] + }, { + "href": "/api/v1/stacks/HDP/versions/2.5/services/HIVE", + "StackServices": { + "service_name": "HIVE", + "service_version": "1.2.1.2.5", + "stack_name": "HDP", + "stack_version": "2.5" + }, + "components": [ + { + "href": "/api/v1/stacks/HDP/versions/2.5/services/HIVE/components/HIVE_SERVER_INTERACTIVE", + "StackServiceComponents": { + "advertise_version": "true", + "bulk_commands_display_name": "", + "bulk_commands_master_component_name": "", + "cardinality": "0-1", + "component_category": "MASTER", + "component_name": "HIVE_SERVER_INTERACTIVE", + "custom_commands": ["RESTART_LLAP"], + "decommission_allowed": "false", + "display_name": "HiveServer2 Interactive", + "has_bulk_commands_definition": "false", + "is_client": "false", + "is_master": "true", + "reassign_allowed": "false", + "recovery_enabled": "false", + "service_name": "HIVE", + "stack_name": "HDP", + "stack_version": "2.5", + "hostnames": ["c6401.ambari.apache.org"] + }, + "dependencies": [] + }, + { + "StackServiceComponents": { + "advertise_version": "true", + "cardinality": "1+", + "component_category": "SLAVE", + "component_name": "NODEMANAGER", + "display_name": "NodeManager", + "is_client": "false", + "is_master": "false", + "hostnames": [ + "c6401.ambari.apache.org" + ] + }, + "dependencies": [] + }, 
+        ]
+      }
+      ],
+      "changed-configurations": [
+        {
+          u'old_value': u'3',
+          u'type': u'hive-interactive-site',
+          u'name': u'hive.server2.tez.sessions.per.default.queue'
+        }
+      ],
+      "configurations": {
+        "capacity-scheduler": {
+          "properties": {
+            "capacity-scheduler": "null",
+            "yarn.scheduler.capacity.root.accessible-node-labels": "*",
+            "yarn.scheduler.capacity.maximum-am-resource-percent": "1",
+            "yarn.scheduler.capacity.root.acl_administer_queue": "*",
+            'yarn.scheduler.capacity.queue-mappings-override.enable': 'false',
+            "yarn.scheduler.capacity.root.default.capacity": "100",
+            "yarn.scheduler.capacity.root.default.user-limit-factor": "1",
+            "yarn.scheduler.capacity.root.queues": "default",
+            "yarn.scheduler.capacity.root.capacity": "100",
+            "yarn.scheduler.capacity.root.default.acl_submit_applications": "*",
+            "yarn.scheduler.capacity.root.default.maximum-capacity": "100",
+            "yarn.scheduler.capacity.node-locality-delay": "40",
+            "yarn.scheduler.capacity.maximum-applications": "10000",
+            "yarn.scheduler.capacity.root.default.state": "RUNNING"
+          }
+        },
+        "hive-interactive-env": {
+          'properties': {
+            'enable_hive_interactive': 'true',
+            'llap_queue_capacity': '50'
+          }
+        },
+        "hive-interactive-site": {
+          'properties': {
+            'hive.llap.daemon.queue.name': 'default',
+            'hive.server2.tez.sessions.per.default.queue': '4',
+            'hive.tez.container.size': '4096'
+          }
+        },
+        "hive-env": {
+          'properties': {
+            'hive_user': 'hive'
+          }
+        },
+        "yarn-site": {
+          "properties": {
+            "yarn.scheduler.minimum-allocation-mb": "1024",
+            "yarn.nodemanager.resource.memory-mb": "212992",
+            "yarn.nodemanager.resource.cpu-vcores": '25'
+          }
+        },
+        "tez-interactive-site": {
+          "properties": {
+            "tez.am.resource.memory.mb": "4096"
+          }
+        },
+        "hive-site": {
+          'properties': {
+            'hive.tez.container.size': '1024'
+          }
+        },
+      }
+    }
+
+    clusterData = {
+      "cpu": 4,
+      "mapMemory": 30000,
+      "amMemory": 20000,
+      "reduceMemory": 20560,
+      "containers": 3,
+      "ramPerContainer": 82240,
+      "referenceNodeManagerHost": {
+        "total_mem": 328960 * 1024
+      },
+      "yarnMinContainerSize": 1024
+    }
+
+    configurations = {
+    }
+
+    # Tests based on concurrency (hive.server2.tez.sessions.per.default.queue) config changes
+
+    ###################################################################
+    # Test A: 'hive.server2.tez.sessions.per.default.queue' set to = 4
+    ###################################################################
+
+    # Test
+    self.stackAdvisor.recommendYARNConfigurations(configurations, clusterData, services, self.hosts_9_total)
+    self.assertTrue('capacity-scheduler' not in configurations)
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.server2.tez.sessions.per.default.queue'], {'maximum': '22'})
+
+    self.assertTrue(configurations['hive-interactive-env']['properties']['num_llap_nodes'], 3)
+    self.assertTrue('num_llap_nodes_for_llap_daemons' not in configurations['hive-interactive-env']['properties'])
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.yarn.container.mb'], '208896')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.num.executors'], '25')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.threadpool.size'], '25')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.memory.size'], '106496')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.enabled'], 'true')
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['llap_heap_size'], '96256')
+    self.assertEqual(configurations['hive-interactive-env']['properties']['hive_heapsize'], '2048')
+    self.assertEqual(configurations['hive-interactive-env']['property_attributes']['num_llap_nodes'], {'maximum': '9', 'minimum': '1', 'read_only': 'true'})
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['slider_am_container_mb'], '1024')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.auto.convert.join.noconditionaltask.size'], '1145044992')
+
+    self.assertTrue('tez.am.resource.memory.mb' not in configurations['tez-interactive-site']['properties'])
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.llap.daemon.queue.name'], {'entries': [{'value': 'default', 'label': 'default'}]})
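Test A's cache expectation is the cache_mem_per_node subtraction from the stack_advisor hunk above, applied to this cluster's numbers; a quick standalone check (not part of the patch):

# Test A: 25 executors per node, each sized at hive.tez.container.size = 4096 MB.
llap_daemon_mem_per_node = 208896             # hive.llap.daemon.yarn.container.mb
total_mem_for_executors_per_node = 25 * 4096  # 102400
cache_mem_per_node = llap_daemon_mem_per_node - total_mem_for_executors_per_node
print(cache_mem_per_node)                     # -> 106496, hive.llap.io.memory.size

The same subtraction gives Tests B and C their hive.llap.io.memory.size values: 207872 - 102400 = 105472 and 204800 - 102400 = 102400.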
+
+    ##################################################################
+    # Test B: 'hive.server2.tez.sessions.per.default.queue' set to = 9
+    ##################################################################
+    # Set the config
+    services['configurations']['hive-interactive-site']['properties']['hive.server2.tez.sessions.per.default.queue'] = 9
+
+    # Test
+    self.stackAdvisor.recommendYARNConfigurations(configurations, clusterData, services, self.hosts_9_total)
+    self.assertTrue('capacity-scheduler' not in configurations)
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.server2.tez.sessions.per.default.queue'], {'maximum': '22'})
+
+    self.assertTrue(configurations['hive-interactive-env']['properties']['num_llap_nodes'], 3)
+    self.assertTrue('num_llap_nodes_for_llap_daemons' not in configurations['hive-interactive-env']['properties'])
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.yarn.container.mb'], '207872')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.num.executors'], '25')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.threadpool.size'], '25')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.memory.size'], '105472')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.enabled'], 'true')
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['llap_heap_size'], '96256')
+    self.assertEqual(configurations['hive-interactive-env']['properties']['hive_heapsize'], '3600')
+    self.assertEqual(configurations['hive-interactive-env']['property_attributes']['num_llap_nodes'], {'maximum': '9', 'minimum': '1', 'read_only': 'true'})
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['slider_am_container_mb'], '1024')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.auto.convert.join.noconditionaltask.size'], '1145044992')
+
+    self.assertTrue('tez.am.resource.memory.mb' not in configurations['tez-interactive-site']['properties'])
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.llap.daemon.queue.name'], {'entries': [{'value': 'default', 'label': 'default'}]})
+
+    ###################################################################
+    # Test C: 'hive.server2.tez.sessions.per.default.queue' set to = 10
+    ###################################################################
+    # Set the config
+    services['configurations']['hive-interactive-site']['properties']['hive.server2.tez.sessions.per.default.queue'] = 10
+
+    # Test
+    self.stackAdvisor.recommendYARNConfigurations(configurations, clusterData, services, self.hosts_9_total)
+    self.assertTrue('capacity-scheduler' not in configurations)
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.server2.tez.sessions.per.default.queue'], {'maximum': '22'})
+
+    self.assertTrue(configurations['hive-interactive-env']['properties']['num_llap_nodes'], 3)
+    self.assertTrue('num_llap_nodes_for_llap_daemons' not in configurations['hive-interactive-env']['properties'])
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.yarn.container.mb'], '204800')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.daemon.num.executors'], '25')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.threadpool.size'], '25')
+
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.memory.size'], '102400')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.llap.io.enabled'], 'true')
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['llap_heap_size'], '96256')
+    self.assertEqual(configurations['hive-interactive-env']['properties']['hive_heapsize'], '4000')
+    self.assertEqual(configurations['hive-interactive-env']['property_attributes']['num_llap_nodes'], {'maximum': '9', 'minimum': '1', 'read_only': 'true'})
+
+    self.assertEqual(configurations['hive-interactive-env']['properties']['slider_am_container_mb'], '1024')
+    self.assertEqual(configurations['hive-interactive-site']['properties']['hive.auto.convert.join.noconditionaltask.size'], '1145044992')
+
+    self.assertTrue('tez.am.resource.memory.mb' not in configurations['tez-interactive-site']['properties'])
+    self.assertEquals(configurations['hive-interactive-site']['property_attributes']['hive.llap.daemon.queue.name'], {'entries': [{'value': 'default', 'label': 'default'}]})
+
+
+
   # Test 16: (1). only 'default' queue exists at root level in capacity-scheduler, and
   #          'capacity-scheduler' configs are passed-in as single "\n" separated string and