AMBARI-18929. YARN service check fails when either Resource Manager is down in an HA-enabled cluster (Weiwei Yang via alejandro)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/88e0c29e Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/88e0c29e Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/88e0c29e Branch: refs/heads/branch-dev-patch-upgrade Commit: 88e0c29e0617f05c0ecb72a75e74b2bb3def6bac Parents: 6100be6 Author: Alejandro Fernandez <afernan...@hortonworks.com> Authored: Thu Dec 1 09:45:56 2016 -0800 Committer: Alejandro Fernandez <afernan...@hortonworks.com> Committed: Thu Dec 1 09:45:56 2016 -0800 ---------------------------------------------------------------------- .../2.1.0.2.0/package/scripts/service_check.py | 66 +++++++---- .../2.0.6/YARN/test_yarn_service_check.py | 111 ++++++++++--------- 2 files changed, 100 insertions(+), 77 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/88e0c29e/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/scripts/service_check.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/scripts/service_check.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/scripts/service_check.py index c0bd480..b934767 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/scripts/service_check.py +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/scripts/service_check.py @@ -130,34 +130,56 @@ class ServiceCheckDefault(ServiceCheck): if "application" in item: application_name = item - for rm_webapp_address in params.rm_webapp_addresses_list: - info_app_url = params.scheme + "://" + rm_webapp_address + "/ws/v1/cluster/apps/" + application_name + # Find out the active RM from RM list + # Raise an exception if the active rm cannot be determined + active_rm_webapp_address = 
self.get_active_rm_webapp_address() + Logger.info("Active Resource Manager web app address is : " + active_rm_webapp_address); - get_app_info_cmd = "curl --negotiate -u : -ks --location-trusted --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url + # Verify job state from active resource manager via rest api + info_app_url = params.scheme + "://" + active_rm_webapp_address + "/ws/v1/cluster/apps/" + application_name + get_app_info_cmd = "curl --negotiate -u : -ks --location-trusted --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url - return_code, stdout, _ = get_user_call_output(get_app_info_cmd, - user=params.smokeuser, - path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin', - ) + return_code, stdout, _ = get_user_call_output(get_app_info_cmd, + user=params.smokeuser, + path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin', + ) - # Handle HDP<2.2.8.1 where RM doesn't do automatic redirection from standby to active - if stdout.startswith("This is standby RM. Redirecting to the current active RM:"): - Logger.info(format("Skipped checking of {rm_webapp_address} since returned '{stdout}'")) - continue + try: + json_response = json.loads(stdout) + except Exception as e: + raise Fail(format("Response from YARN API was not a valid JSON. Response: {stdout}")) - try: - json_response = json.loads(stdout) - except Exception as e: - raise Fail(format("Response from YARN API was not a valid JSON. Response: {stdout}")) - - if json_response is None or 'app' not in json_response or \ - 'state' not in json_response['app'] or 'finalStatus' not in json_response['app']: - raise Fail("Application " + app_url + " returns invalid data.") - - if json_response['app']['state'] != "FINISHED" or json_response['app']['finalStatus'] != "SUCCEEDED": - raise Fail("Application " + app_url + " state/status is not valid. 
Should be FINISHED/SUCCEEDED.") + if json_response is None or 'app' not in json_response or \ + 'state' not in json_response['app'] or 'finalStatus' not in json_response['app']: + raise Fail("Application " + app_url + " returns invalid data.") + if json_response['app']['state'] != "FINISHED" or json_response['app']['finalStatus'] != "SUCCEEDED": + raise Fail("Application " + app_url + " state/status is not valid. Should be FINISHED/SUCCEEDED.") + def get_active_rm_webapp_address(self): + import params + active_rm_webapp_address = None + rm_webapp_addresses = params.rm_webapp_addresses_list + if rm_webapp_addresses is not None and len(rm_webapp_addresses) > 0: + for rm_webapp_address in rm_webapp_addresses: + rm_state_url = params.scheme + "://" + rm_webapp_address + "/ws/v1/cluster/info" + get_cluster_info_cmd = "curl --negotiate -u : -ks --location-trusted --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + rm_state_url + try: + return_code, stdout, _ = get_user_call_output(get_cluster_info_cmd, + user=params.smokeuser, + path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin', + ) + json_response = json.loads(stdout) + if json_response is not None and 'clusterInfo' in json_response \ + and json_response['clusterInfo']['haState'] == "ACTIVE": + active_rm_webapp_address = rm_webapp_address + break + except Exception as e: + Logger.warning(format("Cluster info is not available from calling {get_cluster_info_cmd}")) + + if active_rm_webapp_address is None: + raise Fail('Resource Manager state is not available. 
Failed to determine the active Resource Manager web application address from {0}'.format(','.join(rm_webapp_addresses))); + return active_rm_webapp_address if __name__ == "__main__": ServiceCheck().execute() http://git-wip-us.apache.org/repos/asf/ambari/blob/88e0c29e/ambari-server/src/test/python/stacks/2.0.6/YARN/test_yarn_service_check.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/YARN/test_yarn_service_check.py b/ambari-server/src/test/python/stacks/2.0.6/YARN/test_yarn_service_check.py index bb671aa..fe7456d 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/YARN/test_yarn_service_check.py +++ b/ambari-server/src/test/python/stacks/2.0.6/YARN/test_yarn_service_check.py @@ -22,11 +22,11 @@ import re from mock.mock import MagicMock, call, patch from stacks.utils.RMFTestCase import * -curl_call = MagicMock(return_value=(0, "{ \"app\": {\"state\": \"FINISHED\",\"finalStatus\": \"SUCCEEDED\"}}",'')) +curl_returns = [(0, "{\"clusterInfo\":{\"id\": \"1471586271500\",\"haState\": \"ACTIVE\"}}",''), + (0, "{\"app\":{\"state\": \"FINISHED\",\"finalStatus\":\"SUCCEEDED\"}}",'')] @patch("platform.linux_distribution", new = MagicMock(return_value="Linux")) @patch("sys.executable", new = '/usr/bin/python2.6') -@patch("resource_management.libraries.functions.get_user_call_output.get_user_call_output", new = curl_call) class TestServiceCheck(RMFTestCase): COMMON_SERVICES_PACKAGE_DIR = "YARN/2.1.0.2.0/package" STACK_VERSION = "2.0.6" @@ -38,32 +38,32 @@ class TestServiceCheck(RMFTestCase): re_search_mock.return_value = m m.group.return_value = "http://c6402.ambari.apache.org:8088/proxy/application_1429699682952_0010/" - self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/service_check.py", - classname="ServiceCheck", - command="service_check", - config_file="default.json", - stack_version = self.STACK_VERSION, - target = RMFTestCase.TARGET_COMMON_SERVICES, - checked_call_mocks = 
[(0, "some test text, appTrackingUrl=http:" - "//c6402.ambari.apache.org:8088/proxy/application_1429885383763_0001/, some test text")] - ) - self.assertResourceCalled('HdfsResource', '/user/ambari-qa', - immutable_paths = self.DEFAULT_IMMUTABLE_PATHS, - security_enabled = False, - hadoop_bin_dir = '/usr/bin', - keytab = UnknownConfigurationMock(), - kinit_path_local = '/usr/bin/kinit', - user = 'hdfs', - dfs_type = '', - mode = 0770, - owner = 'ambari-qa', - action = ['create_on_execute'], hdfs_resource_ignore_file='/var/lib/ambari-agent/data/.hdfs_resource_ignore', hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name=UnknownConfigurationMock(), default_fs='hdfs://c6401.ambari.apache.org:8020', - hadoop_conf_dir = '/etc/hadoop/conf', - type = 'directory', - ) - self.assertCurlCallForwardsCredentialsOnRedirect() - self.assertNoMoreResources() - + with patch("resource_management.libraries.functions.get_user_call_output.get_user_call_output", side_effect = curl_returns) as mock_curl: + self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/service_check.py", + classname="ServiceCheck", + command="service_check", + config_file="default.json", + stack_version = self.STACK_VERSION, + target = RMFTestCase.TARGET_COMMON_SERVICES, + checked_call_mocks = [(0, "some test text, appTrackingUrl=http:" + "//c6402.ambari.apache.org:8088/proxy/application_1429885383763_0001/, some test text")] + ) + self.assertResourceCalled('HdfsResource', '/user/ambari-qa', + immutable_paths = self.DEFAULT_IMMUTABLE_PATHS, + security_enabled = False, + hadoop_bin_dir = '/usr/bin', + keytab = UnknownConfigurationMock(), + kinit_path_local = '/usr/bin/kinit', + user = 'hdfs', + dfs_type = '', + mode = 0770, + owner = 'ambari-qa', + action = ['create_on_execute'], hdfs_resource_ignore_file='/var/lib/ambari-agent/data/.hdfs_resource_ignore', hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name=UnknownConfigurationMock(), 
default_fs='hdfs://c6401.ambari.apache.org:8020', + hadoop_conf_dir = '/etc/hadoop/conf', + type = 'directory', + ) + self.assertCurlCallForwardsCredentialsOnRedirect(mock_curl_call = mock_curl) + self.assertNoMoreResources() @patch("re.search") def test_service_check_secured(self, re_search_mock): @@ -71,31 +71,32 @@ class TestServiceCheck(RMFTestCase): re_search_mock.return_value = m m.group.return_value = "http://c6402.ambari.apache.org:8088/proxy/application_1429699682952_0010/" - self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/service_check.py", - classname="ServiceCheck", - command="service_check", - config_file="secured.json", - stack_version = self.STACK_VERSION, - target = RMFTestCase.TARGET_COMMON_SERVICES, - checked_call_mocks = [(0, "some test text, appTrackingUrl=http:" - "//c6402.ambari.apache.org:8088/proxy/application_1429885383763_0001/, some test text")] - ) - self.assertResourceCalled('HdfsResource', '/user/ambari-qa', - immutable_paths = self.DEFAULT_IMMUTABLE_PATHS, - security_enabled = True, - hadoop_bin_dir = '/usr/bin', - keytab = '/etc/security/keytabs/hdfs.headless.keytab', - kinit_path_local = '/usr/bin/kinit', - user = 'hdfs', - dfs_type = '', - mode = 0770, - owner = 'ambari-qa', - action = ['create_on_execute'], hdfs_resource_ignore_file='/var/lib/ambari-agent/data/.hdfs_resource_ignore', hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name='hdfs', default_fs='hdfs://c6401.ambari.apache.org:8020', - hadoop_conf_dir = '/etc/hadoop/conf', - type = 'directory', - ) - self.assertCurlCallForwardsCredentialsOnRedirect() - self.assertNoMoreResources() + with patch("resource_management.libraries.functions.get_user_call_output.get_user_call_output", side_effect = curl_returns) as mock_curl: + self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/service_check.py", + classname="ServiceCheck", + command="service_check", + config_file="secured.json", + stack_version = self.STACK_VERSION, + target = 
RMFTestCase.TARGET_COMMON_SERVICES, + checked_call_mocks = [(0, "some test text, appTrackingUrl=http:" + "//c6402.ambari.apache.org:8088/proxy/application_1429885383763_0001/, some test text")] + ) + self.assertResourceCalled('HdfsResource', '/user/ambari-qa', + immutable_paths = self.DEFAULT_IMMUTABLE_PATHS, + security_enabled = True, + hadoop_bin_dir = '/usr/bin', + keytab = '/etc/security/keytabs/hdfs.headless.keytab', + kinit_path_local = '/usr/bin/kinit', + user = 'hdfs', + dfs_type = '', + mode = 0770, + owner = 'ambari-qa', + action = ['create_on_execute'], hdfs_resource_ignore_file='/var/lib/ambari-agent/data/.hdfs_resource_ignore', hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name='hdfs', default_fs='hdfs://c6401.ambari.apache.org:8020', + hadoop_conf_dir = '/etc/hadoop/conf', + type = 'directory', + ) + self.assertCurlCallForwardsCredentialsOnRedirect(mock_curl_call = mock_curl) + self.assertNoMoreResources() - def assertCurlCallForwardsCredentialsOnRedirect(self): - self.assertIn('--location-trusted', curl_call.call_args[0][0]) \ No newline at end of file + def assertCurlCallForwardsCredentialsOnRedirect(self, mock_curl_call): + self.assertIn('--location-trusted', mock_curl_call.call_args[0][0])