AMBARI-20565. Ambari Agent Alert to detect when 'hdp-select versions' reports an error (alejandro)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/f18fad36 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/f18fad36 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/f18fad36 Branch: refs/heads/branch-dev-logsearch Commit: f18fad36821e148b69138ec2be6b5cd70ba207cc Parents: e3d9ff6 Author: Alejandro Fernandez <afernan...@hortonworks.com> Authored: Fri Mar 24 12:24:18 2017 -0700 Committer: Alejandro Fernandez <afernan...@hortonworks.com> Committed: Tue Mar 28 17:26:15 2017 -0700 ---------------------------------------------------------------------- .../libraries/functions/stack_select.py | 16 ++- .../server/checks/AtlasPresenceCheck.java | 4 +- ambari-server/src/main/resources/alerts.json | 12 +++ .../host_scripts/alert_version_select.py | 104 +++++++++++++++++++ .../server/api/services/AmbariMetaInfoTest.java | 14 +-- .../metadata/AgentAlertDefinitionsTest.java | 2 +- 6 files changed, 141 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py ---------------------------------------------------------------------- diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py b/ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py index 20b4cbd..79393b9 100644 --- a/ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py +++ b/ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py @@ -290,11 +290,25 @@ def _get_upgrade_stack(): return None +def unsafe_get_stack_versions(): + """ + Gets list of stack versions installed on the host. + By default a call to <stack-selector-tool> versions is made to get the list of installed stack versions. + DO NOT use a fall-back since this function is called by alerts in order to find potential errors. + :return: Returns a tuple of (exit code, output, list of installed stack versions). + """ + stack_selector_path = stack_tools.get_stack_tool_path(stack_tools.STACK_SELECTOR_NAME) + code, out = call((STACK_SELECT_PREFIX, stack_selector_path, 'versions')) + versions = [] + if 0 == code: + for line in out.splitlines(): + versions.append(line.rstrip('\n')) + return (code, out, versions) def get_stack_versions(stack_root): """ Gets list of stack versions installed on the host. - Be default a call to <stack-selector-tool> versions is made to get the list of installed stack versions. + By default a call to <stack-selector-tool> versions is made to get the list of installed stack versions. As a fallback list of installed versions is collected from stack version directories in stack install root. :param stack_root: Stack install root :return: Returns list of installed stack versions. http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java b/ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java index 8556436..04b73fa 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java @@ -29,8 +29,8 @@ import org.slf4j.LoggerFactory; import com.google.inject.Singleton; /** - * Checks if Atlas service is present. Upgrade to stack HDP 2.5 can't pursuit - * with existed on the cluster Atlas service. + * Checks if Atlas service is present. Upgrade to stack HDP 2.5 from previous stack + * must first delete Atlas from the cluster. */ @Singleton @UpgradeCheck(group = UpgradeCheckGroup.DEFAULT) http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-server/src/main/resources/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/alerts.json b/ambari-server/src/main/resources/alerts.json index 2559b3a..d646401 100644 --- a/ambari-server/src/main/resources/alerts.json +++ b/ambari-server/src/main/resources/alerts.json @@ -179,6 +179,18 @@ } ] } + }, + { + "name": "ambari_agent_version_select", + "label": "Ambari Agent Distro/Conf Select Versions", + "description": "This host-level alert is triggered if the distro selector such as hdp-select cannot calculate versions available on this host. This may indicate that /usr/$stack/ directory has links/dirs that do not belong inside of it.", + "interval": 5, + "scope": "HOST", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": "alert_version_select.py" + } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-server/src/main/resources/host_scripts/alert_version_select.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/host_scripts/alert_version_select.py b/ambari-server/src/main/resources/host_scripts/alert_version_select.py new file mode 100644 index 0000000..118911f --- /dev/null +++ b/ambari-server/src/main/resources/host_scripts/alert_version_select.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import os +import logging +import socket +import json + +from resource_management.libraries.script.script import Script +from resource_management.libraries.functions.stack_select import unsafe_get_stack_versions + +RESULT_STATE_OK = 'OK' +RESULT_STATE_WARNING = 'WARNING' +RESULT_STATE_CRITICAL = 'CRITICAL' +RESULT_STATE_UNKNOWN = 'UNKNOWN' + +STACK_TOOLS = '{{cluster-env/stack_tools}}' + + +logger = logging.getLogger() + + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (STACK_TOOLS,) + + +def execute(configurations={}, parameters={}, host_name=None): + """ + Checks if the stack selector such as hdp-select can find versions installed on this host. E.g., + hdp-select versions + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value + host_name (string): the name of this host where the alert is running + """ + msg = [] + try: + if configurations is None: + return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.']) + + # Check required properties + if STACK_TOOLS not in configurations: + return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(STACK_TOOLS)]) + + # Of the form, + # { "stack_selector": ["hdp-select", "/usr/bin/hdp-select", "hdp-select"], "conf_selector": ["conf-select", "/usr/bin/conf-select", "conf-select"] } + stack_tools_str = configurations[STACK_TOOLS] + + if stack_tools_str is None: + return (RESULT_STATE_UNKNOWN, ['{} is a required parameter for the script and the value is null'.format(STACK_TOOLS)]) + + distro_select = "unknown-distro-select" + try: + stack_tools = json.loads(stack_tools_str) + distro_select = stack_tools["stack_selector"][0] + except: + pass + + # This may not exist if the host does not contain any stack components, + # or only contains components like Ambari Metrics and SmartSense + stack_root_dir = Script.get_stack_root() + + if os.path.isdir(stack_root_dir): + (code, out, versions) = unsafe_get_stack_versions() + + if code == 0: + msg.append("Ok. {}".format(distro_select)) + if versions is not None and type(versions) is list and len(versions) > 0: + msg.append("Versions: {}".format(", ".join(versions))) + return (RESULT_STATE_OK, ["\n".join(msg)]) + else: + msg.append("Failed, check dir {} for unexpected contents.".format(stack_root_dir)) + if out is not None: + msg.append(out) + + return (RESULT_STATE_CRITICAL, ["\n".join(msg)]) + else: + msg.append("Ok. No stack root {} to check.".format(stack_root_dir)) + return (RESULT_STATE_OK, ["\n".join(msg)]) + except Exception, e: + return (RESULT_STATE_CRITICAL, [e.message]) http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java b/ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java index f1af66f..9ff7def 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java @@ -1935,7 +1935,7 @@ public class AmbariMetaInfoTest { AlertDefinitionDAO dao = injector.getInstance(AlertDefinitionDAO.class); List<AlertDefinitionEntity> definitions = dao.findAll(clusterId); - assertEquals(11, definitions.size()); + assertEquals(12, definitions.size()); // figure out how many of these alerts were merged into from the // non-stack alerts.json @@ -1947,7 +1947,7 @@ public class AmbariMetaInfoTest { } } - assertEquals(2, hostAlertCount); + assertEquals(3, hostAlertCount); assertEquals(9, definitions.size() - hostAlertCount); for (AlertDefinitionEntity definition : definitions) { @@ -1958,7 +1958,7 @@ public class AmbariMetaInfoTest { metaInfo.reconcileAlertDefinitions(clusters); definitions = dao.findAll(); - assertEquals(11, definitions.size()); + assertEquals(12, definitions.size()); for (AlertDefinitionEntity definition : definitions) { assertEquals(28, definition.getScheduleInterval().intValue()); @@ -1967,7 +1967,7 @@ public class AmbariMetaInfoTest { // find all enabled for the cluster should find 6 (the ones from HDFS; // it will not find the agent alert since it's not bound to the cluster) definitions = dao.findAllEnabled(cluster.getClusterId()); - assertEquals(10, definitions.size()); + assertEquals(11, definitions.size()); // create new definition AlertDefinitionEntity entity = new AlertDefinitionEntity(); @@ -1986,19 +1986,19 @@ public class AmbariMetaInfoTest { // verify the new definition is found (6 HDFS + 1 new one) definitions = dao.findAllEnabled(cluster.getClusterId()); - assertEquals(11, definitions.size()); + assertEquals(12, definitions.size()); // reconcile, which should disable our bad definition metaInfo.reconcileAlertDefinitions(clusters); // find all enabled for the cluster should find 6 definitions = dao.findAllEnabled(cluster.getClusterId()); - assertEquals(10, definitions.size()); + assertEquals(11, definitions.size()); // find all should find 6 HDFS + 1 disabled + 1 agent alert + 2 server // alerts definitions = dao.findAll(); - assertEquals(12, definitions.size()); + assertEquals(13, definitions.size()); entity = dao.findById(entity.getDefinitionId()); assertFalse(entity.getEnabled()); http://git-wip-us.apache.org/repos/asf/ambari/blob/f18fad36/ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java b/ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java index cbc5e69..7378b8c 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java @@ -60,7 +60,7 @@ public class AgentAlertDefinitionsTest { public void testLoadingAgentHostAlerts() { AmbariServiceAlertDefinitions ambariServiceAlertDefinitions = m_injector.getInstance(AmbariServiceAlertDefinitions.class); List<AlertDefinition> definitions = ambariServiceAlertDefinitions.getAgentDefinitions(); - Assert.assertEquals(2, definitions.size()); + Assert.assertEquals(3, definitions.size()); for( AlertDefinition definition : definitions){ Assert.assertEquals(Components.AMBARI_AGENT.name(),