Repository: ambari Updated Branches: refs/heads/trunk d7c5a1bbf -> d0da3f7c8
AMBARI-15704: Include an alert informing the number of segments marked down in gp_segment_configuration table (Goutam Tadi via mithmatt) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/d0da3f7c Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/d0da3f7c Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/d0da3f7c Branch: refs/heads/trunk Commit: d0da3f7c8f00b84240f3c2d0222c769db97efd13 Parents: d7c5a1b Author: Matt <mmat...@pivotal.io> Authored: Tue Apr 5 11:57:37 2016 -0700 Committer: Matt <mmat...@pivotal.io> Committed: Tue Apr 5 11:57:37 2016 -0700 ---------------------------------------------------------------------- .../common-services/HAWQ/2.0.0/alerts.json | 13 ++ .../alerts/alert_segment_registration_status.py | 117 +++++++++++++ .../test_alert_segment_registration_status.py | 170 +++++++++++++++++++ 3 files changed, 300 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/d0da3f7c/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json index 8da5beb..620cb90 100644 --- a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json @@ -42,6 +42,19 @@ } }, { + "name": "hawqsegments_registration_status", + "label": "HAWQ Segment Registration Status", + "description": "This alert is triggered when a HAWQ Segment node fails to register with the HAWQ Master.", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": "HAWQ/2.0.0/package/alerts/alert_segment_registration_status.py", + "parameters": [] + } + }, + { "name": 
"hawq_master_process", "label": "HAWQ Master Process", "description": "This alert is triggered if the HAWQ Master process cannot be confirmed to be up and listening on the network.", http://git-wip-us.apache.org/repos/asf/ambari/blob/d0da3f7c/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_segment_registration_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_segment_registration_status.py b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_segment_registration_status.py new file mode 100644 index 0000000..4d09763 --- /dev/null +++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_segment_registration_status.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
import logging
import os.path
import re

HAWQ_USER = 'gpadmin'
HAWQ_HOME = '/usr/local/hawq'
HAWQ_GREENPLUM_PATH_FILE = "{0}/greenplum_path.sh".format(HAWQ_HOME)
HAWQ_SLAVES_FILE = "{0}/etc/slaves".format(HAWQ_HOME)
# Token in {{site/property}} form; its resolved value is looked up in the
# configurations dictionary handed to execute().
HAWQMASTER_PORT = '{{hawq-site/hawq_master_address_port}}'

RESULT_STATE_OK = 'OK'
RESULT_STATE_WARNING = 'WARNING'
RESULT_STATE_UNKNOWN = 'UNKNOWN'
RESULT_STATE_SKIPPED = 'SKIPPED'

logger = logging.getLogger('ambari_alerts')


def get_tokens():
    """
    Returns the list of tokens, in the format {{site/property}}, that will be
    used to build the configurations dictionary passed into execute().
    """
    return [HAWQMASTER_PORT]


def execute(configurations={}, parameters={}, host_name=None):
    """
    Compares the segment hosts reported by the HAWQ Master database with the
    hosts listed in the slaves file and reports the result as an alert.

    Returns a tuple (result state, [message]):
      UNKNOWN - configurations is None, or the comparison raised an exception
      SKIPPED - the slaves file (HAWQ_SLAVES_FILE) does not exist
      OK      - both sources contain exactly the same set of hosts
      WARNING - at least one host appears in only one of the two sources

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value;
                                 must contain the HAWQMASTER_PORT token
    parameters (dictionary): a mapping of script parameter key to value (unused)
    host_name (string): the name of this host where the alert is running

    The dictionary defaults are kept for interface compatibility; they are
    only read, never modified.
    """
    if configurations is None:
        logger.error("[Alert HAWQ] Configurations file is either not accessible or not present.")
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
    logger.debug("Configuration File found")

    if not os.path.isfile(HAWQ_SLAVES_FILE):
        logger.error("[Alert HAWQ] Slaves file is not present in {0}".format(HAWQ_SLAVES_FILE))
        # Derived from the constant so the message cannot drift from the path actually checked.
        slaves_dir = os.path.dirname(HAWQ_SLAVES_FILE)
        return (RESULT_STATE_SKIPPED, ['Slaves file is not present in {0}'.format(slaves_dir)])

    try:
        db_segments = set(get_segment_list_db(configurations[HAWQMASTER_PORT]))
        # Sets are used so that duplicate entries in the slaves file are counted once.
        ambari_segments = set(get_segment_list_ambari())

        # Symmetric difference: hosts present in exactly one of the two sources,
        # i.e. both "A - B" and "B - A" contribute to the count.
        segment_diff = db_segments ^ ambari_segments
        if not segment_diff:
            return (RESULT_STATE_OK, ['All HAWQ Segments are registered.'])

        segment_diff_len = len(segment_diff)
        if segment_diff_len > 1:
            msg = '{0} HAWQ Segments are not registered with HAWQ Master.'.format(segment_diff_len)
        else:
            msg = '1 HAWQ Segment is not registered with HAWQ Master.'
        # Sorted so that the logged host list is stable between runs.
        logger.error(" [Alert HAWQ] Segments Unregistered: {0} are unregistered/down.".format(sorted(segment_diff)))
        return (RESULT_STATE_WARNING, [msg + " Try restarting HAWQ service if a segment has been added/removed. Check the log file in /var/log/ambari-agent/ambari-alerts.log for more details on unregistered hosts."])
    except Exception as ex:
        logger.error('[Alert HAWQ] Could not find HAWQ Segments registration status on {0}'.format(host_name))
        logger.exception(str(ex))

    # Registration status cannot be determined
    return (RESULT_STATE_UNKNOWN, ['HAWQ Segments Registration Status cannot be determined.'])


def _parse_hostnames(command_output):
    """
    Turns the text printed by psql into a list of hostnames: one entry per
    non-blank line, with surrounding whitespace removed.
    Returns an empty list for empty or missing output.
    """
    if not command_output:
        return []
    stripped_lines = [line.strip() for line in command_output.splitlines()]
    # Blank lines must not become '' hostnames, which would be counted as a mismatch.
    return [hostname for hostname in stripped_lines if hostname]


def get_segment_list_db(port):
    """
    Gets the hostnames of the segments with role 'p' and status 'u' from
    gp_segment_configuration by running a SQL command through psql on the given port.

    Raises an Exception when the command exits with a non-zero return code.
    """
    # Imported here because this is the only function that needs the Ambari shell
    # helper; the rest of the module stays loadable without resource_management.
    from resource_management.core.shell import call

    logger.debug("Fetching segment list from HAWQ Master Database.")
    query = " SELECT hostname FROM gp_segment_configuration where role = 'p' and status = 'u' "
    cmd = "source {0} && psql -p {1} -t -d template1 -c \"{2};\"".format(HAWQ_GREENPLUM_PATH_FILE, port, query)

    returncode, command_output = call(cmd,
                                      user=HAWQ_USER,
                                      timeout=60)

    if returncode:
        # An explicit exception carries the real cause; a bare "raise" has no
        # active exception to re-raise at this point.
        raise Exception("psql exited with return code {0} while reading gp_segment_configuration: {1}".format(returncode, command_output))
    return _parse_hostnames(command_output)


def get_segment_list_ambari():
    """
    Gets the segment hostnames listed in the slaves file (HAWQ_SLAVES_FILE).
    Every whitespace-separated entry in the file is returned, duplicates included.

    Re-raises any exception hit while reading the file, after logging it.
    """
    logger.debug("Fetching Slaves from Slaves file in {0}".format(HAWQ_SLAVES_FILE))
    try:
        with open(HAWQ_SLAVES_FILE, "r") as slaves_file:
            slaves = slaves_file.read()
        # Every run of non-whitespace characters is one entry, so blank lines are skipped.
        return re.findall(r'\S+', slaves)
    except Exception:
        logger.error("[Alert HAWQ] Get Segment list from Slaves : Could not read slaves from {0}".format(HAWQ_SLAVES_FILE))
        # Bare raise keeps the original traceback.
        raise


# ----------------------------------------------------------------------
# Remainder of the commit e-mail that shared these source lines (header of the
# next file in the diff), kept verbatim:
#
# http://git-wip-us.apache.org/repos/asf/ambari/blob/d0da3f7c/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_segment_registration_status.py
# ----------------------------------------------------------------------
# diff --git a/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_segment_registration_status.py b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_segment_registration_status.py
# new file mode 100644
# index 0000000..6bb5930
# --- /dev/null
# +++ b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_segment_registration_status.py
# @@ -0,0 +1,170 @@
# +#!/usr/bin/env python
# +
# +'''
# +Licensed to the Apache Software Foundation (ASF) under one
# +or more contributor license agreements. See the NOTICE file
# +distributed with this work for additional information
# +regarding copyright ownership. The ASF licenses this file
# +to you under the Apache License, Version 2.0 (the
# +"License"); you may not use this file except in compliance
# +with the License. You may obtain a copy of the License at
# +
# + http://www.apache.org/licenses/LICENSE-2.0
# +
# +Unless required by applicable law or agreed to in writing, software
# +distributed under the License is distributed on an "AS IS" BASIS,
# +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# +See the License for the specific language governing permissions and
# +limitations under the License.
# System imports
import os
import sys
from resource_management.core.shell import call
from mock.mock import patch

# Local imports
from stacks.utils.RMFTestCase import *

COMMON_SERVICES_ALERTS_DIR = "HAWQ/2.0.0/package/alerts"

# Start at the directory holding this test, climb five directory levels, then
# descend into main/resources/common-services/<alerts dir>, where the alert script lives.
file_path = os.path.dirname(os.path.abspath(__file__))
for _level in range(5):
    file_path = os.path.dirname(file_path)
file_path = os.path.join(file_path, "main", "resources", "common-services", COMMON_SERVICES_ALERTS_DIR)

RESULT_STATE_OK = 'OK'
RESULT_STATE_WARNING = 'WARNING'
RESULT_STATE_UNKNOWN = 'UNKNOWN'
RESULT_STATE_SKIPPED = 'SKIPPED'


class TestAlertRegistrationStatus(RMFTestCase):
    """
    Unit tests for alert_segment_registration_status.execute(). The two functions
    that supply the host lists are patched, so neither psql nor a slaves file is needed.
    """

    HOST_LIST_A = ['HOST1', 'HOST2', 'HOST3', 'HOST4']
    HOST_LIST_B = ['HOST1', 'HOST3', 'HOST5', 'HOST4']
    HOST_LIST_C = ['HOST1', 'HOST2', 'HOST3']

    # Configurations handed to execute() by every test that gets past the "no configs" check.
    PORT_CONFIGS = {"{{hawq-site/hawq_master_address_port}}": "5432"}

    # Text that follows the count sentence in every WARNING message.
    WARNING_HINT = (" Try restarting HAWQ service if a segment has been added/removed."
                    " Check the log file in /var/log/ambari-agent/ambari-alerts.log"
                    " for more details on unregistered hosts.")

    def setUp(self):
        """
        Import the module under test.
        It lives in a different folder, so that folder is appended to the system path,
        and the module name is made global so the test methods can use it.
        :return:
        """
        sys.path.append(file_path)
        global alert_segment_registration_status
        import alert_segment_registration_status

    def _assert_alert(self, configs, expected_status, expected_message):
        """
        Run execute() with the given configurations and check that it returns the
        expected status together with exactly one message equal to expected_message.
        """
        status, messages = alert_segment_registration_status.execute(configurations=configs)
        self.assertEqual(status, expected_status)
        self.assertTrue(messages is not None and len(messages) == 1)
        self.assertEqual(messages[0], expected_message)

    def test_missing_configs(self):
        """
        The status is UNKNOWN when no configurations are supplied.
        """
        self._assert_alert(None, RESULT_STATE_UNKNOWN,
                           'There were no configurations supplied to the script.')

    @patch("os.path.isfile", return_value=False)
    def test_missing_slave_file(self, isfile_mock):
        """
        The status is SKIPPED when the slaves file is missing.
        """
        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_SKIPPED,
                           'Slaves file is not present in /usr/local/hawq/etc')

    @patch("alert_segment_registration_status.get_segment_list_db")
    @patch("alert_segment_registration_status.get_segment_list_ambari")
    @patch("os.path.isfile", return_value=True)
    def test_successful_registration_status(self, isfile_mock, ambari_list_mock, db_list_mock):
        """
        The status is OK when the database and the slaves file report the same hosts.
        """
        ambari_list_mock.return_value = self.HOST_LIST_A
        db_list_mock.return_value = self.HOST_LIST_A

        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_OK,
                           'All HAWQ Segments are registered.')

    @patch("alert_segment_registration_status.get_segment_list_db")
    @patch("alert_segment_registration_status.get_segment_list_ambari")
    @patch("os.path.isfile", return_value=True)
    def test_unsuccessful_registration_status_plural(self, isfile_mock, ambari_list_mock, db_list_mock):
        """
        The status is WARNING, with a plural message, when two hosts differ between
        the two lists (HOST2 only in the slaves list, HOST5 only in the database list).
        """
        ambari_list_mock.return_value = self.HOST_LIST_A
        db_list_mock.return_value = self.HOST_LIST_B

        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_WARNING,
                           '2 HAWQ Segments are not registered with HAWQ Master.' + self.WARNING_HINT)

    @patch("alert_segment_registration_status.get_segment_list_db")
    @patch("alert_segment_registration_status.get_segment_list_ambari")
    @patch("os.path.isfile", return_value=True)
    def test_unsuccessful_registration_status(self, isfile_mock, ambari_list_mock, db_list_mock):
        """
        The status is WARNING, with a singular message, when exactly one host differs
        (HOST4 is in the slaves list but not in the database list).
        """
        ambari_list_mock.return_value = self.HOST_LIST_A
        db_list_mock.return_value = self.HOST_LIST_C

        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_WARNING,
                           '1 HAWQ Segment is not registered with HAWQ Master.' + self.WARNING_HINT)

    @patch("alert_segment_registration_status.get_segment_list_db")
    @patch("alert_segment_registration_status.get_segment_list_ambari")
    @patch("os.path.isfile", return_value=True)
    def test_exception_registration_status(self, isfile_mock, ambari_list_mock, db_list_mock):
        """
        The status is UNKNOWN when fetching the database segment list raises an exception.
        """
        ambari_list_mock.return_value = self.HOST_LIST_A
        db_list_mock.side_effect = Exception("Exception raised to fail")

        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_UNKNOWN,
                           'HAWQ Segments Registration Status cannot be determined.')

    @patch("alert_segment_registration_status.get_segment_list_db")
    @patch("alert_segment_registration_status.get_segment_list_ambari")
    @patch("os.path.isfile", return_value=True)
    def test_unsuccessful_empty_db_registration_status(self, isfile_mock, ambari_list_mock, db_list_mock):
        """
        The status is WARNING when the slaves list is empty while the database
        list holds three hosts; all three are counted as differences.
        """
        ambari_list_mock.return_value = []
        db_list_mock.return_value = self.HOST_LIST_C

        self._assert_alert(self.PORT_CONFIGS, RESULT_STATE_WARNING,
                           '3 HAWQ Segments are not registered with HAWQ Master.' + self.WARNING_HINT)