Repository: ambari Updated Branches: refs/heads/trunk de3667a21 -> 80afc9f02
AMBARI-10061 Alert Failures on Windows (echekanskiy via fbarca) Alerts for oozie, storm and ams are broken on windows os. Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/80afc9f0 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/80afc9f0 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/80afc9f0 Branch: refs/heads/trunk Commit: 80afc9f027244eb17cd32c778f47e0bfd2398bf0 Parents: de3667a Author: Florian Barca <fba...@hortonworks.com> Authored: Tue Mar 17 03:25:57 2015 -0700 Committer: Florian Barca <fba...@hortonworks.com> Committed: Tue Mar 17 03:25:57 2015 -0700 ---------------------------------------------------------------------- .../alerts/alert_ambari_metrics_monitor.py | 18 ++ .../configuration/falcon-startup.properties.xml | 26 --- .../package/alerts/alert_check_oozie_server.py | 105 ++++++----- .../HDPWIN/2.1/services/STORM/alerts.json | 174 +++++++++++++++++++ .../package/alerts/check_supervisor_process.py | 49 ++++++ 5 files changed, 304 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py index 3e87e25..04a2e01 100644 --- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py +++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py @@ -23,7 +23,11 @@ import socket from 
resource_management.libraries.functions.check_process_status import check_process_status from resource_management.core.exceptions import ComponentIsNotRunning +from ambari_commons import OSCheck, OSConst +from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl +if OSCheck.is_windows_family(): + from resource_management.libraries.functions.windows_service_utils import check_windows_service_status RESULT_CODE_OK = 'OK' RESULT_CODE_CRITICAL = 'CRITICAL' RESULT_CODE_UNKNOWN = 'UNKNOWN' @@ -37,7 +41,21 @@ def get_tokens(): """ return (AMS_MONITOR_PID_DIR,) +@OsFamilyFuncImpl(OSConst.WINSRV_FAMILY) +def is_monitor_process_live(pid_file=None): + """ + Gets whether the Metrics Monitor Service is running. + :param pid_file: ignored + :return: True if the monitor is running, False otherwise + """ + try: + check_windows_service_status("AmbariMetricsHostMonitoring") + ams_monitor_process_running = True + except: + ams_monitor_process_running = False + return ams_monitor_process_running +@OsFamilyFuncImpl(OsFamilyImpl.DEFAULT) def is_monitor_process_live(pid_file): """ Gets whether the Metrics Monitor represented by the specified file is running. 
http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml b/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml index 252fed4..6a35c17 100644 --- a/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml +++ b/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml @@ -183,30 +183,4 @@ <value>DEFAULT</value> <description>The kerberos names rules is to resolve kerberos principal names, refer to Hadoop's KerberosName for more details.</description> </property> - <!--kerberos params, must be set during security enabling--> - <property> - <name>*.falcon.service.authentication.kerberos.principal</name> - <value>falcon/_HOST@EXAMPLE.COM</value> - <description></description> - </property> - <property> - <name>*.falcon.service.authentication.kerberos.keytab</name> - <value>/etc/security/keytabs/falcon.service.keytab</value> - <description></description> - </property> - <property> - <name>*.dfs.namenode.kerberos.principal</name> - <value>nn/_HOST@EXAMPLE.COM</value> - <description>name node principal to talk to config store</description> - </property> - <property> - <name>*.falcon.http.authentication.kerberos.principal</name> - <value>HTTP/_HOST@EXAMPLE.COM</value> - <description>Indicates the Kerberos principal to be used for HTTP endpoint</description> - </property> - <property> - <name>*.falcon.http.authentication.kerberos.keytab</name> - <value>/etc/security/keytabs/spnego.service.keytab</value> - <description>Location of the keytab file with the credentials for the HTTP principal</description> - 
</property> </configuration> http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py index 9e2775b..9e65e6b 100644 --- a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py +++ b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py @@ -24,7 +24,8 @@ from resource_management.libraries.functions import format from resource_management.libraries.functions import get_kinit_path from resource_management.libraries.functions import get_klist_path from ambari_commons.os_check import OSConst, OSCheck -from os import getpid, sep +from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl +import os from urlparse import urlparse RESULT_CODE_OK = 'OK' @@ -36,6 +37,17 @@ SECURITY_ENABLED = '{{cluster-env/security_enabled}}' OOZIE_PRINCIPAL = '{{oozie-site/oozie.authentication.kerberos.principal}}' OOZIE_KEYTAB = '{{oozie-site/oozie.authentication.kerberos.keytab}}' +class KerberosPropertiesNotFound(Exception): pass + +@OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY) +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (OOZIE_URL_KEY,) + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -43,6 +55,52 @@ def get_tokens(): """ return (OOZIE_URL_KEY, OOZIE_PRINCIPAL, SECURITY_ENABLED, OOZIE_KEYTAB) 
+@OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY) +def get_check_command(oozie_url, host_name, parameters): + from resource_management.libraries.functions import reload_windows_env + reload_windows_env() + oozie_home = os.environ['OOZIE_HOME'] + command = format("{oozie_home}\\bin\\oozie.cmd admin -oozie {oozie_url} -status") + return (command, None) + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) +def get_check_command(oozie_url, host_name, parameters): + security_enabled = False + if SECURITY_ENABLED in parameters: + security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE' + kerberos_env = None + if security_enabled: + if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters: + oozie_keytab = parameters[OOZIE_KEYTAB] + oozie_principal = parameters[OOZIE_PRINCIPAL] + + # substitute _HOST in kerberos principal with actual fqdn + oozie_principal = oozie_principal.replace('_HOST', host_name) + else: + raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.') + + # Create the kerberos credentials cache (ccache) file and set it in the environment to use + # when executing curl + env = Environment.get_instance() + ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid()) + kerberos_env = {'KRB5CCNAME': ccache_file} + + klist_path_local = get_klist_path() + klist_command = format("{klist_path_local} -s {ccache_file}") + + # Determine if we need to kinit by testing to see if the relevant cache exists and has + # non-expired tickets. 
Tickets are marked to expire after 5 minutes to help reduce the number + # it kinits we do but recover quickly when keytabs are regenerated + return_code, _ = call(klist_command) + if return_code != 0: + kinit_path_local = get_kinit_path() + kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ") + + # kinit + Execute(kinit_command, environment=kerberos_env) + command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status") + return (command, kerberos_env) + def execute(parameters=None, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label @@ -65,50 +123,13 @@ def execute(parameters=None, host_name=None): oozie_url = parameters[OOZIE_URL_KEY] oozie_url = oozie_url.replace(urlparse(oozie_url).hostname,localhost_address) - security_enabled = False - if SECURITY_ENABLED in parameters: - security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE' - - command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status") - try: - # kinit if security is enabled so that oozie-env.sh can make the web request - kerberos_env = None - - if security_enabled: - if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters: - oozie_keytab = parameters[OOZIE_KEYTAB] - oozie_principal = parameters[OOZIE_PRINCIPAL] - - # substitute _HOST in kerberos principal with actual fqdn - oozie_principal = oozie_principal.replace('_HOST', host_name) - else: - return (RESULT_CODE_UNKNOWN, ['The Oozie keytab and principal are required parameters when security is enabled.']) - - # Create the kerberos credentials cache (ccache) file and set it in the environment to use - # when executing curl - env = Environment.get_instance() - ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, sep, getpid()) - kerberos_env = {'KRB5CCNAME': ccache_file} - - klist_path_local = get_klist_path() - klist_command = format("{klist_path_local} -s {ccache_file}") - 
- # Determine if we need to kinit by testing to see if the relevant cache exists and has - # non-expired tickets. Tickets are marked to expire after 5 minutes to help reduce the number - # it kinits we do but recover quickly when keytabs are regenerated - return_code, _ = call(klist_command) - if return_code != 0: - kinit_path_local = get_kinit_path() - kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ") - - # kinit - Execute(kinit_command, environment=kerberos_env) - + command, env = get_check_command(oozie_url, host_name, parameters) # execute the command - Execute(command, environment=kerberos_env) + Execute(command, environment=env) return (RESULT_CODE_OK, ["Successful connection to {0}".format(oozie_url)]) - + except KerberosPropertiesNotFound, ex: + return (RESULT_CODE_UNKNOWN, [str(ex)]) except Exception, ex: return (RESULT_CODE_CRITICAL, [str(ex)]) http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json new file mode 100644 index 0000000..babf7cf --- /dev/null +++ b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json @@ -0,0 +1,174 @@ +{ + "STORM": { + "service": [ + { + "name": "storm_supervisor_process_percent", + "label": "Percent Supervisors Available", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "storm_supervisor_process", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.3 + } + } + } + } + ], + "STORM_UI_SERVER": [ + { + "name": 
"storm_server_process", + "label": "Storm Server Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{storm-site/ui.port}}", + "default_port": 8744, + "reporting": { + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + } + } + }, + { + "name": "storm_webui", + "label": "Storm Web UI", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "WEB", + "uri": { + "http": "{{storm-site/ui.port}}" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.3f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.3f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + } + ], + "NIMBUS": [ + { + "name": "storm_nimbus_process", + "label": "Nimbus Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{storm-site/nimbus.thrift.port}}", + "default_port": 6627, + "reporting": { + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + } + } + } + ], + "DRPC_SERVER": [ + { + "name": "storm_drpc_server", + "label": "DRPC Server Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{storm-site/drpc.port}}", + "default_port": 3772, + "reporting": { + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + } + } + } + ], + "STORM_REST_API": [ + { + "name": "storm_rest_api", + "label": "Storm REST API", + "interval": 1, + 
"scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8745", + "default_port": 8745, + "reporting": { + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + } + } + } + ], + "SUPERVISOR": [ + { + "name": "storm_supervisor_process", + "label": "Supervisor Process", + "interval": 1, + "scope": "HOST", + "source": { + "type": "SCRIPT", + "path": "HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py" + } + } + ] + } +} http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py new file mode 100644 index 0000000..dcae64a --- /dev/null +++ b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +from resource_management.libraries.functions import check_windows_service_status + + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return () + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + try: + check_windows_service_status("supervisor") + return (RESULT_CODE_OK, ["Supervisor is running"]) + except: + return (RESULT_CODE_CRITICAL, ["Supervisor is stopped"])