Repository: ambari
Updated Branches:
  refs/heads/trunk de3667a21 -> 80afc9f02


AMBARI-10061 Alert Failures on Windows (echekanskiy via fbarca)

Alerts for Oozie, Storm, and Ambari Metrics (AMS) are broken on Windows.


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/80afc9f0
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/80afc9f0
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/80afc9f0

Branch: refs/heads/trunk
Commit: 80afc9f027244eb17cd32c778f47e0bfd2398bf0
Parents: de3667a
Author: Florian Barca <fba...@hortonworks.com>
Authored: Tue Mar 17 03:25:57 2015 -0700
Committer: Florian Barca <fba...@hortonworks.com>
Committed: Tue Mar 17 03:25:57 2015 -0700

----------------------------------------------------------------------
 .../alerts/alert_ambari_metrics_monitor.py      |  18 ++
 .../configuration/falcon-startup.properties.xml |  26 ---
 .../package/alerts/alert_check_oozie_server.py  | 105 ++++++-----
 .../HDPWIN/2.1/services/STORM/alerts.json       | 174 +++++++++++++++++++
 .../package/alerts/check_supervisor_process.py  |  49 ++++++
 5 files changed, 304 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py
 
b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py
index 3e87e25..04a2e01 100644
--- 
a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py
+++ 
b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py
@@ -23,7 +23,11 @@ import socket
 
 from resource_management.libraries.functions.check_process_status import 
check_process_status
 from resource_management.core.exceptions import ComponentIsNotRunning
+from ambari_commons import OSCheck, OSConst
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
 
+if OSCheck.is_windows_family():
+  from resource_management.libraries.functions.windows_service_utils import 
check_windows_service_status
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_CRITICAL = 'CRITICAL'
 RESULT_CODE_UNKNOWN = 'UNKNOWN'
@@ -37,7 +41,21 @@ def get_tokens():
   """
   return (AMS_MONITOR_PID_DIR,)
 
+@OsFamilyFuncImpl(OSConst.WINSRV_FAMILY)
+def is_monitor_process_live(pid_file=None):
+  """
+  Gets whether the Metrics Monitor Service is running.
+  :param pid_file: ignored
+  :return: True if the monitor is running, False otherwise
+  """
+  try:
+    check_windows_service_status("AmbariMetricsHostMonitoring")
+    ams_monitor_process_running = True
+  except:
+    ams_monitor_process_running = False
+  return ams_monitor_process_running
 
+@OsFamilyFuncImpl(OsFamilyImpl.DEFAULT)
 def is_monitor_process_live(pid_file):
   """
   Gets whether the Metrics Monitor represented by the specified file is 
running.

http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml
 
b/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml
index 252fed4..6a35c17 100644
--- 
a/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml
+++ 
b/ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/configuration/falcon-startup.properties.xml
@@ -183,30 +183,4 @@
     <value>DEFAULT</value>
     <description>The kerberos names rules is to resolve kerberos principal 
names, refer to Hadoop's KerberosName for more details.</description>
   </property>
-  <!--kerberos params, must be set during security enabling-->
-  <property>
-    <name>*.falcon.service.authentication.kerberos.principal</name>
-    <value>falcon/_h...@example.com</value>
-    <description></description>
-  </property>
-  <property>
-    <name>*.falcon.service.authentication.kerberos.keytab</name>
-    <value>/etc/security/keytabs/falcon.service.keytab</value>
-    <description></description>
-  </property>
-  <property>
-    <name>*.dfs.namenode.kerberos.principal</name>
-    <value>nn/_h...@example.com</value>
-    <description>name node principal to talk to config store</description>
-  </property>
-  <property>
-    <name>*.falcon.http.authentication.kerberos.principal</name>
-    <value>HTTP/_h...@example.com</value>
-    <description>Indicates the Kerberos principal to be used for HTTP 
endpoint</description>
-  </property>
-  <property>
-    <name>*.falcon.http.authentication.kerberos.keytab</name>
-    <value>/etc/security/keytabs/spnego.service.keytab</value>
-    <description>Location of the keytab file with the credentials for the HTTP 
principal</description>
-  </property>
 </configuration>

http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py
 
b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py
index 9e2775b..9e65e6b 100644
--- 
a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py
+++ 
b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py
@@ -24,7 +24,8 @@ from resource_management.libraries.functions import format
 from resource_management.libraries.functions import get_kinit_path
 from resource_management.libraries.functions import get_klist_path
 from ambari_commons.os_check import OSConst, OSCheck
-from os import getpid, sep
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
+import os
 from urlparse import urlparse
 
 RESULT_CODE_OK = 'OK'
@@ -36,6 +37,17 @@ SECURITY_ENABLED = '{{cluster-env/security_enabled}}'
 OOZIE_PRINCIPAL = '{{oozie-site/oozie.authentication.kerberos.principal}}'
 OOZIE_KEYTAB = '{{oozie-site/oozie.authentication.kerberos.keytab}}'
 
+class KerberosPropertiesNotFound(Exception): pass
+
+@OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY)
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (OOZIE_URL_KEY,)
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -43,6 +55,52 @@ def get_tokens():
   """
   return (OOZIE_URL_KEY, OOZIE_PRINCIPAL, SECURITY_ENABLED, OOZIE_KEYTAB)
 
+@OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY)
+def get_check_command(oozie_url, host_name, parameters):
+  from resource_management.libraries.functions import reload_windows_env
+  reload_windows_env()
+  oozie_home = os.environ['OOZIE_HOME']
+  command = format("{oozie_home}\\bin\\oozie.cmd admin -oozie {oozie_url} 
-status")
+  return (command, None)
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def get_check_command(oozie_url, host_name, parameters):
+  security_enabled = False
+  if SECURITY_ENABLED in parameters:
+    security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE'
+  kerberos_env = None
+  if security_enabled:
+    if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters:
+      oozie_keytab = parameters[OOZIE_KEYTAB]
+      oozie_principal = parameters[OOZIE_PRINCIPAL]
+
+      # substitute _HOST in kerberos principal with actual fqdn
+      oozie_principal = oozie_principal.replace('_HOST', host_name)
+    else:
+      raise KerberosPropertiesNotFound('The Oozie keytab and principal are 
required parameters when security is enabled.')
+
+    # Create the kerberos credentials cache (ccache) file and set it in the 
environment to use
+    # when executing kinit and the oozie status command
+    env = Environment.get_instance()
+    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, 
os.getpid())
+    kerberos_env = {'KRB5CCNAME': ccache_file}
+
+    klist_path_local = get_klist_path()
+    klist_command = format("{klist_path_local} -s {ccache_file}")
+
+    # Determine if we need to kinit by testing to see if the relevant cache 
exists and has
+    # non-expired tickets.  Tickets are marked to expire after 5 minutes to 
help reduce the number
+    # of kinits we do but recover quickly when keytabs are regenerated
+    return_code, _ = call(klist_command)
+    if return_code != 0:
+      kinit_path_local = get_kinit_path()
+      kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} 
{oozie_principal}; ")
+
+      # kinit
+      Execute(kinit_command, environment=kerberos_env)
+  command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie 
{oozie_url} -status")
+  return (command, kerberos_env)
+
 def execute(parameters=None, host_name=None):
   """
   Returns a tuple containing the result code and a pre-formatted result label
@@ -65,50 +123,13 @@ def execute(parameters=None, host_name=None):
   oozie_url = parameters[OOZIE_URL_KEY]
   oozie_url = oozie_url.replace(urlparse(oozie_url).hostname,localhost_address)
 
-  security_enabled = False
-  if SECURITY_ENABLED in parameters:
-    security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE'
-
-  command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie 
{oozie_url} -status")
-
   try:
-    # kinit if security is enabled so that oozie-env.sh can make the web 
request
-    kerberos_env = None
-
-    if security_enabled:
-      if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters:
-        oozie_keytab = parameters[OOZIE_KEYTAB]
-        oozie_principal = parameters[OOZIE_PRINCIPAL]
-
-        # substitute _HOST in kerberos principal with actual fqdn
-        oozie_principal = oozie_principal.replace('_HOST', host_name)
-      else:
-        return (RESULT_CODE_UNKNOWN, ['The Oozie keytab and principal are 
required parameters when security is enabled.'])
-
-      # Create the kerberos credentials cache (ccache) file and set it in the 
environment to use
-      # when executing curl
-      env = Environment.get_instance()
-      ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, sep, 
getpid())
-      kerberos_env = {'KRB5CCNAME': ccache_file}
-
-      klist_path_local = get_klist_path()
-      klist_command = format("{klist_path_local} -s {ccache_file}")
-
-      # Determine if we need to kinit by testing to see if the relevant cache 
exists and has
-      # non-expired tickets.  Tickets are marked to expire after 5 minutes to 
help reduce the number
-      # it kinits we do but recover quickly when keytabs are regenerated
-      return_code, _ = call(klist_command)
-      if return_code != 0:
-        kinit_path_local = get_kinit_path()
-        kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} 
{oozie_principal}; ")
-
-        # kinit
-        Execute(kinit_command, environment=kerberos_env)
-
+    command, env = get_check_command(oozie_url, host_name, parameters)
     # execute the command
-    Execute(command, environment=kerberos_env)
+    Execute(command, environment=env)
 
     return (RESULT_CODE_OK, ["Successful connection to {0}".format(oozie_url)])
-
+  except KerberosPropertiesNotFound, ex:
+    return (RESULT_CODE_UNKNOWN, [str(ex)])
   except Exception, ex:
     return (RESULT_CODE_CRITICAL, [str(ex)])

http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json 
b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json
new file mode 100644
index 0000000..babf7cf
--- /dev/null
+++ 
b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/alerts.json
@@ -0,0 +1,174 @@
+{
+  "STORM": {
+    "service": [
+      {
+        "name": "storm_supervisor_process_percent",
+        "label": "Percent Supervisors Available",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "storm_supervisor_process",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.1
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.3
+            }
+          }
+        }
+      }
+    ],
+    "STORM_UI_SERVER": [
+      {
+        "name": "storm_server_process",
+        "label": "Storm Server Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{storm-site/ui.port}}",
+          "default_port": 8744,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.3f}s response on port {1}"
+            },
+            "warning": {
+              "text": "TCP OK - {0:.3f}s response on port {1}",
+              "value": 1.5
+            },
+            "critical": {
+              "text": "Connection failed: {0} to {1}:{2}",
+              "value": 5.0
+            }
+          }
+        }
+      },
+      {
+        "name": "storm_webui",
+        "label": "Storm Web UI",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{storm-site/ui.port}}"
+          },
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {2:.3f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {2:.3f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}"
+            }
+          }
+        }
+      }
+    ],
+    "NIMBUS": [
+      {
+        "name": "storm_nimbus_process",
+        "label": "Nimbus Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{storm-site/nimbus.thrift.port}}",
+          "default_port": 6627,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.3f}s response on port {1}"
+            },
+            "warning": {
+              "text": "TCP OK - {0:.3f}s response on port {1}",
+              "value": 1.5
+            },
+            "critical": {
+              "text": "Connection failed: {0} to {1}:{2}",
+              "value": 5.0
+            }
+          }
+        }
+      }
+    ],
+    "DRPC_SERVER": [
+      {
+        "name": "storm_drpc_server",
+        "label": "DRPC Server Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{storm-site/drpc.port}}",
+          "default_port": 3772,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.3f}s response on port {1}"
+            },
+            "warning": {
+              "text": "TCP OK - {0:.3f}s response on port {1}",
+              "value": 1.5
+            },
+            "critical": {
+              "text": "Connection failed: {0} to {1}:{2}",
+              "value": 5.0
+            }
+          }
+        }
+      }
+    ],
+    "STORM_REST_API": [
+      {
+        "name": "storm_rest_api",
+        "label": "Storm REST API",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "8745",
+          "default_port": 8745,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.3f}s response on port {1}"
+            },
+            "warning": {
+              "text": "TCP OK - {0:.3f}s response on port {1}",
+              "value": 1.5
+            },
+            "critical": {
+              "text": "Connection failed: {0} to {1}:{2}",
+              "value": 5.0
+            }
+          }
+        }
+      }
+    ],
+    "SUPERVISOR": [
+      {
+        "name": "storm_supervisor_process",
+        "label": "Supervisor Process",
+        "interval": 1,
+        "scope": "HOST",
+        "source": {
+          "type": "SCRIPT",
+          "path": 
"HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py"
+        }
+      }
+    ]
+  }
+}

http://git-wip-us.apache.org/repos/asf/ambari/blob/80afc9f0/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py
 
b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py
new file mode 100644
index 0000000..dcae64a
--- /dev/null
+++ 
b/ambari-server/src/main/resources/stacks/HDPWIN/2.1/services/STORM/package/alerts/check_supervisor_process.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from resource_management.libraries.functions import 
check_windows_service_status
+
+
+RESULT_CODE_OK = 'OK'
+RESULT_CODE_CRITICAL = 'CRITICAL'
+RESULT_CODE_UNKNOWN = 'UNKNOWN'
+
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return ()
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  try:
+    check_windows_service_status("supervisor")
+    return (RESULT_CODE_OK, ["Supervisor is running"])
+  except:
+    return (RESULT_CODE_CRITICAL, ["Supervisor is stopped"])

Reply via email to