Repository: ambari Updated Branches: refs/heads/trunk 0d2ac4f1b -> 92726b954
AMBARI-20467 - Add alerts for Livy in Spark and Spark 2 (Mingjie Tang via jonathanhurley) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/7a0c9e46 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/7a0c9e46 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/7a0c9e46 Branch: refs/heads/trunk Commit: 7a0c9e469596046f8c71bd790c67851429b77e26 Parents: 0d2ac4f Author: Jonathan Hurley <jhur...@hortonworks.com> Authored: Fri Apr 28 08:46:30 2017 -0400 Committer: Jonathan Hurley <jhur...@hortonworks.com> Committed: Fri Apr 28 08:46:30 2017 -0400 ---------------------------------------------------------------------- .../common-services/SPARK/1.2.1/alerts.json | 24 +++ .../scripts/alerts/alert_spark_livy_port.py | 146 +++++++++++++++++++ .../common-services/SPARK2/2.0.0/alerts.json | 24 +++ .../scripts/alerts/alert_spark2_livy_port.py | 146 +++++++++++++++++++ 4 files changed, 340 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/7a0c9e46/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json index 0e38f16..d3c1a59 100644 --- a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json +++ b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json @@ -27,6 +27,30 @@ } } } + ], + "LIVY_SERVER": [ + { + "name": "livy_server_status", + "label": "Spark Livy Server", + "description": "This host-level alert is triggered if the Livy Server cannot be determined to be up.", + "interval": 1, + "scope": "HOST", + "source": { + "type": "SCRIPT", + "path": "SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py", + "parameters": [ + { + "name": "check.command.timeout", + "display_name": "Command Timeout", + "value": 60.0, + "type": "NUMERIC", + "description": "The maximum time before check command will be killed by timeout", + "units": "seconds", + "threshold": "CRITICAL" + } + ] + } + } ] } } http://git-wip-us.apache.org/repos/asf/ambari/blob/7a0c9e46/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py new file mode 100644 index 0000000..7396440 --- /dev/null +++ b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py @@ -0,0 +1,146 @@ +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import time +import logging +import traceback +import socket +from resource_management import * +from resource_management.libraries.functions import format +from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl +from resource_management.libraries.script.script import Script +from resource_management.core.resources import Execute +from resource_management.core.logger import Logger +from resource_management.core import global_lock +from resource_management.libraries.functions import get_kinit_path + + +OK_MESSAGE = "TCP OK - {0:.3f}s response on port {1}" +CRITICAL_MESSAGE = "Connection failed on host {0}:{1} ({2})" + +logger = logging.getLogger('ambari_alerts') + +LIVY_SERVER_PORT_KEY = '{{livy-conf/livy.server.port}}' + +LIVYUSER_DEFAULT = 'livy' + +CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout' +CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0 + +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' +SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' +SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}' +SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' + +# The configured Kerberos executable search paths, if any +KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}' + + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (LIVY_SERVER_PORT_KEY,LIVYUSER_DEFAULT,SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,SMOKEUSER_KEY) + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) +def execute(configurations={}, parameters={}, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if configurations is None: + return ('UNKNOWN', ['There were no configurations supplied to the script.']) + + LIVY_PORT_DEFAULT = 8998 + + port = LIVY_PORT_DEFAULT + if LIVY_SERVER_PORT_KEY in configurations: + port = int(configurations[LIVY_SERVER_PORT_KEY]) + + if host_name is None: + host_name = socket.getfqdn() + + livyuser = LIVYUSER_DEFAULT + + security_enabled = False + if SECURITY_ENABLED_KEY in configurations: + security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + + smokeuser_kerberos_keytab = None + if SMOKEUSER_KEYTAB_KEY in configurations: + smokeuser_kerberos_keytab = configurations[SMOKEUSER_KEYTAB_KEY] + + if host_name is None: + host_name = socket.getfqdn() + + smokeuser_principal = None + if SMOKEUSER_PRINCIPAL_KEY in configurations: + smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY] + smokeuser_principal = smokeuser_principal.replace('_HOST',host_name.lower()) + + # Get the configured Kerberos executable search paths, if any + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + else: + kerberos_executable_search_paths = None + + kinit_path_local = get_kinit_path(kerberos_executable_search_paths) + + if security_enabled: + kinitcmd = format("{kinit_path_local} -kt {smokeuser_kerberos_keytab} {smokeuser_principal}; ") + # prevent concurrent kinit + kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS) + kinit_lock.acquire() + try: + Execute(kinitcmd, user=livyuser) + finally: + kinit_lock.release() + + result_code = None + try: + start_time = time.time() + try: + livy_livyserver_host = str(host_name) + + livy_cmd = format("curl -s -o /dev/null -w'%{{http_code}}' --negotiate -u: -k http://{livy_livyserver_host}:{port}/sessions | grep 200 ") + + Execute(livy_cmd, + tries=3, + try_sleep=1, + logoutput=True, + user=livyuser + ) + + total_time = time.time() - start_time + result_code = 'OK' + label = OK_MESSAGE.format(total_time, port) + except: + result_code = 'CRITICAL' + label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc()) + except: + label = traceback.format_exc() + result_code = 'UNKNOWN' + + return (result_code, [label]) http://git-wip-us.apache.org/repos/asf/ambari/blob/7a0c9e46/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json index dc9d023..2e03f13 100755 --- a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json @@ -27,6 +27,30 @@ } } } + ], + "LIVY2_SERVER": [ + { + "name": "livy2_server_status", + "label": "Spark2 Livy Server", + "description": "This host-level alert is triggered if the Livy2 Server cannot be determined to be up.", + "interval": 1, + "scope": "HOST", + "source": { + "type": "SCRIPT", + "path": "SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py", + "parameters": [ + { + "name": "check.command.timeout", + "display_name": "Command Timeout", + "value": 60.0, + "type": "NUMERIC", + "description": "The maximum time before check command will be killed by timeout", + "units": "seconds", + "threshold": "CRITICAL" + } + ] + } + } ] } } http://git-wip-us.apache.org/repos/asf/ambari/blob/7a0c9e46/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py new file mode 100644 index 0000000..44c284f --- /dev/null +++ b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py @@ -0,0 +1,146 @@ +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import time +import logging +import traceback +import socket +from resource_management import * +from resource_management.libraries.functions import format +from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl +from resource_management.libraries.script.script import Script +from resource_management.core.resources import Execute +from resource_management.core.logger import Logger +from resource_management.core import global_lock +from resource_management.libraries.functions import get_kinit_path + + +OK_MESSAGE = "TCP OK - {0:.3f}s response on port {1}" +CRITICAL_MESSAGE = "Connection failed on host {0}:{1} ({2})" + +logger = logging.getLogger('ambari_alerts') + +LIVY_SERVER_PORT_KEY = '{{livy2-conf/livy.server.port}}' + +LIVYUSER_DEFAULT = 'livy' + +CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout' +CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0 + +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' +SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' +SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}' +SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' + +# The configured Kerberos executable search paths, if any +KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}' + + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (LIVY_SERVER_PORT_KEY,LIVYUSER_DEFAULT,SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,SMOKEUSER_KEY) + +@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) +def execute(configurations={}, parameters={}, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if configurations is None: + return ('UNKNOWN', ['There were no configurations supplied to the script.']) + + LIVY_PORT_DEFAULT = 8999 + + port = LIVY_PORT_DEFAULT + if LIVY_SERVER_PORT_KEY in configurations: + port = int(configurations[LIVY_SERVER_PORT_KEY]) + + if host_name is None: + host_name = socket.getfqdn() + + livyuser = LIVYUSER_DEFAULT + + security_enabled = False + if SECURITY_ENABLED_KEY in configurations: + security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + + smokeuser_kerberos_keytab = None + if SMOKEUSER_KEYTAB_KEY in configurations: + smokeuser_kerberos_keytab = configurations[SMOKEUSER_KEYTAB_KEY] + + if host_name is None: + host_name = socket.getfqdn() + + smokeuser_principal = None + if SMOKEUSER_PRINCIPAL_KEY in configurations: + smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY] + smokeuser_principal = smokeuser_principal.replace('_HOST',host_name.lower()) + + # Get the configured Kerberos executable search paths, if any + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + else: + kerberos_executable_search_paths = None + + kinit_path_local = get_kinit_path(kerberos_executable_search_paths) + + if security_enabled: + kinitcmd = format("{kinit_path_local} -kt {smokeuser_kerberos_keytab} {smokeuser_principal}; ") + # prevent concurrent kinit + kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS) + kinit_lock.acquire() + try: + Execute(kinitcmd, user=livyuser) + finally: + kinit_lock.release() + + result_code = None + try: + start_time = time.time() + try: + livy2_livyserver_host = str(host_name) + + livy_cmd = format("curl -s -o /dev/null -w'%{{http_code}}' --negotiate -u: -k http://{livy2_livyserver_host}:{port}/sessions | grep 200 ") + + Execute(livy_cmd, + tries=3, + try_sleep=1, + logoutput=True, + user=livyuser + ) + + total_time = time.time() - start_time + result_code = 'OK' + label = OK_MESSAGE.format(total_time, port) + except: + result_code = 'CRITICAL' + label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc()) + except: + label = traceback.format_exc() + result_code = 'UNKNOWN' + + return (result_code, [label])