Repository: ambari Updated Branches: refs/heads/branch-2.2 15933088c -> 27e510b82
AMBARI-15105: Add alerts for HAWQ components status (bhuvnesh2703 via jaoki) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/27e510b8 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/27e510b8 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/27e510b8 Branch: refs/heads/branch-2.2 Commit: 27e510b82aefd61b31d3da4f420b455689516603 Parents: 1593308 Author: Jun Aoki <ja...@apache.org> Authored: Thu Feb 25 15:46:52 2016 -0800 Committer: Jun Aoki <ja...@apache.org> Committed: Thu Feb 25 15:46:52 2016 -0800 ---------------------------------------------------------------------- .../common-services/HAWQ/2.0.0/alerts.json | 93 +++++++++++- .../package/alerts/alert_component_status.py | 76 ++++++++++ .../2.3/HAWQ/test_alert_component_status.py | 141 +++++++++++++++++++ ambari-web/app/views/main/dashboard/widgets.js | 2 +- 4 files changed, 310 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/27e510b8/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json index 3119a0c..14ad6d7 100644 --- a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/alerts.json @@ -1,5 +1,32 @@ { "HAWQ": { + "service": [ + { + "name": "hawq_segment_process_percent", + "label": "Percent HAWQ Segments Available", + "description": "This alert is triggered if the number of down HAWQ Segments in the cluster is greater than the configured critical threshold.", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "hawq_segment_process", + 
"reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.3 + } + } + } + } + ], "HAWQMASTER": [ { "name": "hawqstandby_sync_status", @@ -13,7 +40,71 @@ "path": "HAWQ/2.0.0/package/alerts/alert_sync_status.py", "parameters": [] } + }, + { + "name": "hawq_master_process", + "label": "HAWQ Master Process", + "description": "This alert is triggered if the HAWQ Master process cannot be confirmed to be up and listening on the network.", + "interval": 1, + "scope": "ANY", + "source": { + "type": "SCRIPT", + "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py", + "parameters": [ + { + "name": "component_name", + "display_name": "Component Name", + "value": "master", + "type": "STRING", + "description": "This text string indicates if it is a Master, Standby or Segment" + } + ] + } + } + ], + "HAWQSEGMENT": [ + { + "name": "hawq_segment_process", + "label": "HAWQ Segment Process", + "description": "This host-level alert is triggered if the HAWQ Segment process cannot be confirmed to be up and listening on the network.", + "interval": 1, + "scope": "HOST", + "source": { + "type": "SCRIPT", + "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py", + "parameters": [ + { + "name": "component_name", + "display_name": "Component Name", + "value": "segment", + "type": "STRING", + "description": "This text string indicates if it is a Master, Standby or Segment" + } + ] + } + } + ], + "HAWQSTANDBY": [ + { + "name": "hawq_standby_process", + "label": "HAWQ Standby Process", + "description": "This alert is triggered if the HAWQ Standby process cannot be confirmed to be up and listening on the network.", + "interval": 1, + "scope": "ANY", + "source": { + "type": "SCRIPT", + "path": "HAWQ/2.0.0/package/alerts/alert_component_status.py", + "parameters": [ + { + "name": "component_name", + "display_name": 
"Component Name", + "value": "standby", + "type": "STRING", + "description": "This text string indicates if it is a Master, Standby or Segment" + } + ] + } } ] } -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/ambari/blob/27e510b8/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py new file mode 100644 index 0000000..9ca9ac6 --- /dev/null +++ b/ambari-server/src/main/resources/common-services/HAWQ/2.0.0/package/alerts/alert_component_status.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +from resource_management.core.shell import call + +HAWQMASTER_PORT = '{{hawq-site/hawq_master_address_port}}' +HAWQSEGMENT_PORT = '{{hawq-site/hawq_segment_address_port}}' +HAWQSTANDBY_ADDRESS = '{{hawq-site/hawq_standby_address_host}}' + +RESULT_STATE_OK = 'OK' +RESULT_STATE_UNKNOWN = 'UNKNOWN' +RESULT_STATE_SKIPPED = 'SKIPPED' +RESULT_STATE_CRITICAL = 'CRITICAL' + +COMPONENT_PROCESS_MAP = { + "segment": "postgres", + "master": "postgres", + "standby": "gpsyncmaster" + } + + + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used to build the dictionary passed into execute + """ + return (HAWQMASTER_PORT, HAWQSEGMENT_PORT, HAWQSTANDBY_ADDRESS) + + +def execute(configurations={}, parameters={}, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if configurations is None: + return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.']) + + component = parameters['component_name'] + # Identify port of the process + port = configurations[HAWQSEGMENT_PORT] if component == "segment" else configurations[HAWQMASTER_PORT] + + component_name = component.capitalize() + is_running = is_component_running(port, COMPONENT_PROCESS_MAP[component]) + if is_running: + return (RESULT_STATE_OK, ['HAWQ {0} is running'.format(component_name)]) + else: + return (RESULT_STATE_CRITICAL, ['HAWQ {0} is not running'.format(component_name)]) + +def is_component_running(port, process): + """ + Check if the process is running on the specified port + """ + cmd = "netstat -tupln | egrep ':{0}\s' | egrep {1}".format(port, process) + rc, op= call(cmd, timeout=60) + return rc == 0 
http://git-wip-us.apache.org/repos/asf/ambari/blob/27e510b8/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py new file mode 100644 index 0000000..b2e1d4d --- /dev/null +++ b/ambari-server/src/test/python/stacks/2.3/HAWQ/test_alert_component_status.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +''' +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +# System imports +import os +import sys + +from mock.mock import patch + +# Local imports +from stacks.utils.RMFTestCase import * + +COMMON_SERVICES_ALERTS_DIR = "HAWQ/2.0.0/package/alerts" + +file_path = os.path.dirname(os.path.abspath(__file__)) +file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file_path))))) +file_path = os.path.join(file_path, "main", "resources", "common-services", COMMON_SERVICES_ALERTS_DIR) + +WORKING_CONFIGS = { + "{{hawq-site/hawq_master_address_port}}": "5432", + "{{hawq-site/hawq_segment_address_port}}": "40000", + "{{hawq-site/hawq_standby_address_host}}": "c6402.ambari.apache.org" + } + +class TestAlertComponentStatus(RMFTestCase): + + def setUp(self): + """ + Import the class under test. + Because the class is present in a different folder, append its dir to the system path. + Also, shorten the import name and make it a global so the test functions can access it. + :return: + """ + sys.path.append(file_path) + global alert_component_status + import alert_component_status + + def test_missing_configs(self): + """ + Check that the status is UNKNOWN when configs are missing. 
+ """ + configs = None + [status, messages] = alert_component_status.execute(configurations=configs) + self.assertEqual(status, alert_component_status.RESULT_STATE_UNKNOWN) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'There were no configurations supplied to the script.') + + @patch("alert_component_status.is_component_running") + def test_hawq_master_ok(self, is_component_running_mock): + """ + Test that the status is OK when HAWQ Master is up + """ + # Mock calls + is_component_running_mock.return_value = True + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'master'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_OK) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'HAWQ Master is running') + + @patch("alert_component_status.is_component_running") + def test_hawq_master_critical(self, is_component_running_mock): + """ + Test that the status is CRITICAL when HAWQ Master is down + """ + # Mock calls + is_component_running_mock.return_value = False + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'master'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'HAWQ Master is not running') + + @patch("alert_component_status.is_component_running") + def test_hawq_standby_ok(self, is_component_running_mock): + """ + Test that the status is OK when HAWQ Standby is up + """ + # Mock calls + is_component_running_mock.return_value = True + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'standby'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_OK) + self.assertTrue(messages is not None and len(messages) == 1) +
self.assertEqual(messages[0], 'HAWQ Standby is running') + + @patch("alert_component_status.is_component_running") + def test_hawq_standby_critical(self, is_component_running_mock): + """ + Test that the status is CRITICAL when HAWQ Standby is down + """ + # Mock calls + is_component_running_mock.return_value = False + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'standby'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'HAWQ Standby is not running') + + @patch("alert_component_status.is_component_running") + def test_hawq_segment_ok(self, is_component_running_mock): + """ + Test that the status is OK when HAWQ Segment is up + """ + # Mock calls + is_component_running_mock.return_value = True + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'segment'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_OK) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'HAWQ Segment is running') + + @patch("alert_component_status.is_component_running") + def test_hawq_segment_critical(self, is_component_running_mock): + """ + Test that the status is CRITICAL when HAWQ Segment is down + """ + # Mock calls + is_component_running_mock.return_value = False + + [status, messages] = alert_component_status.execute(configurations=WORKING_CONFIGS, parameters={'component_name': 'segment'}) + self.assertEqual(status, alert_component_status.RESULT_STATE_CRITICAL) + self.assertTrue(messages is not None and len(messages) == 1) + self.assertEqual(messages[0], 'HAWQ Segment is not running') http://git-wip-us.apache.org/repos/asf/ambari/blob/27e510b8/ambari-web/app/views/main/dashboard/widgets.js ---------------------------------------------------------------------- diff --git
a/ambari-web/app/views/main/dashboard/widgets.js b/ambari-web/app/views/main/dashboard/widgets.js index 4786944..46a48ec 100644 --- a/ambari-web/app/views/main/dashboard/widgets.js +++ b/ambari-web/app/views/main/dashboard/widgets.js @@ -478,7 +478,7 @@ App.MainDashboardWidgetsView = Em.View.extend(App.UserPref, App.LocalStorage, Ap visible: [], hidden: [], threshold: {1: [80, 90], 2: [85, 95], 3: [90, 95], 4: [80, 90], 5: [1000, 3000], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [70, 90], 14: [150, 250], 15: [3, 10], 16: [], - 17: [70, 90], 18: [], 19: [50, 75], 20: [50, 75], 21: [85, 95], 22: [85, 95], 23: [], 24: [80, 90]} // id:[thresh1, thresh2] + 17: [70, 90], 18: [], 19: [50, 75], 20: [50, 75], 21: [85, 95], 22: [85, 95], 23: [], 24: [70, 90]} // id:[thresh1, thresh2] }), /**