Ori.livneh has submitted this change and it was merged.

Change subject: Add an Icinga check for Graphite metric freshness
......................................................................


Add an Icinga check for Graphite metric freshness

Checks a Graphite metric and generates WARNING or CRITICAL states if the most
recent datapoint is older than the required freshness threshold.

Change-Id: I2dcb8b08f2aa70108ead46c9510cfcb521abe158
---
A modules/monitoring/manifests/graphite_freshness.pp
A modules/nagios_common/files/check_commands/check_graphite_freshness
A modules/nagios_common/files/check_commands/check_graphite_freshness.cfg
M modules/nagios_common/manifests/commands.pp
4 files changed, 141 insertions(+), 0 deletions(-)

Approvals:
  Ori.livneh: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/monitoring/manifests/graphite_freshness.pp b/modules/monitoring/manifests/graphite_freshness.pp
new file mode 100644
index 0000000..60baa43
--- /dev/null
+++ b/modules/monitoring/manifests/graphite_freshness.pp
@@ -0,0 +1,53 @@
+# == Define: monitoring::graphite_freshness
+#
+# Provisions an Icinga check that ensures a Graphite metric is 'fresh':
+# that is, continuing to receive updates.
+#
+# === Parameters
+#
+# [*metric*]
+#   Graphite metric name. For example: 'reqstats.500'.
+#   Defaults to the resource title.
+#
+# [*warning*]
+#   Warn if most recent datapoint is older than this value.
+#   Value suffix may be one of 's', 'm', 'h' or 'd' for seconds,
+#   minutes, hours, or days, respectively.
+#
+# [*critical*]
+#   Crit if most recent datapoint is older than this value.
+#   Value suffix may be one of 's', 'm', 'h' or 'd' for seconds,
+#   minutes, hours, or days, respectively.
+#
+# [*ensure*]
+#   Passed through unchanged as the 'ensure' attribute of the underlying
+#   monitoring::service resource. Defaults to present.
+#
+# [*graphite_url*]
+#   URL of Graphite's render API endpoint.
+#   Defaults to 'https://graphite.wikimedia.org/render'.
+#
+# [*contact_group*]
+#   Icinga contact group that should receive alerts.
+#   Defaults to 'admins'.
+#
+# === Examples
+#
+#  # Emit a warning if most recent datapoint for metric 'reqerror.500'
+#  # is older than 5 minutes, and a critical alert if older than 10.
+#  monitoring::graphite_freshness { 'reqerror.500':
+#    warning  => '5m',
+#    critical => '10m',
+#  }
+#
+define monitoring::graphite_freshness(
+    $warning,
+    $critical,
+    $metric        = $title,
+    $ensure        = present,
+    $graphite_url  = 'https://graphite.wikimedia.org/render',
+    $contact_group = 'admins'
+) {
+    monitoring::service { $title:
+        ensure        => $ensure,
+        description   => "'${metric}' Graphite freshness",
+        # The '!'-separated values are positional command arguments, in
+        # this order: metric, render URL, warning threshold, critical
+        # threshold.
+        check_command => "check_graphite_freshness!${metric}!${graphite_url}!${warning}!${critical}",
+        contact_group => $contact_group,
+    }
+}
diff --git a/modules/nagios_common/files/check_commands/check_graphite_freshness b/modules/nagios_common/files/check_commands/check_graphite_freshness
new file mode 100755
index 0000000..bc6fb7e
--- /dev/null
+++ b/modules/nagios_common/files/check_commands/check_graphite_freshness
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+  check_graphite_freshness
+  ~~~~~~~~~~~~~~~~~~~~~~~~
+
+  Checks a Graphite metric and generates WARNING or CRITICAL states if
+  the most recent datapoint is older than the required freshness threshold.
+
+  Usage:
+    check_graphite_freshness [-w THRESHOLD] [-c THRESHOLD] METRIC RENDER_URL
+
+  Positional arguments:
+    METRIC                metric name
+    RENDER_URL            URL of graphite's render API
+
+  Optional arguments:
+    -w THRESHOLD, --warning THRESHOLD   warn if most recent datapoint
+                                        is older than this value
+    -c THRESHOLD, --critical THRESHOLD  alert if most recent datapoint
+                                        is older than this value
+
+"""
+from __future__ import print_function
+
+import sys
+reload(sys)
+sys.setdefaultencoding("utf-8")
+
+import argparse
+import datetime
+import json
+import time
+import urllib2
+
+
+def time_spec(spec_string):
+    """Parse a time specification string consisting of a number
+    followed by an optional letter specifying the unit."""
+    units = {'s': 'seconds', 'm': 'minutes', 'h': 'hours', 'd': 'days'}
+    if spec_string[-1].isalpha():
+        unit = units[spec_string[-1]]
+        count = int(spec_string[:-1])
+    else:
+        unit = 'seconds'
+        count = int(spec_string)
+    return datetime.timedelta(**{unit: count})
+
+
+ap = argparse.ArgumentParser(description='Graphite staleness alert')
+ap.add_argument('metric', help='metric name')
+ap.add_argument('render_url', help="URL of graphite's render API")
+ap.add_argument('-w', '--warning', type=time_spec, metavar='THRESHOLD',
+                help='warn if most recent datapoint is older than this value')
+ap.add_argument('-c', '--critical', type=time_spec, metavar='THRESHOLD',
+                help='alert if most recent datapoint is older than this value')
+args = ap.parse_args()
+if args.critical is None and args.warning is None:
+    ap.error('You must specify one (or both) of -w/--warning or -c/--critical')
+
+
+try:
+    url = args.render_url + '?format=json&target=' + args.metric
+    data = json.load(urllib2.urlopen(url))[0]
+    most_recent = datetime.datetime.utcfromtimestamp(max(
+            ts for value, ts in data['datapoints'] if value is not None))
+    staleness = datetime.datetime.utcnow() - most_recent
+except Exception as e:
+    print('UNKNOWN: failed to check %s' % args.metric)
+    raise
+    sys.exit(3)
+
+if args.critical and staleness > args.critical:
+    print('CRITICAL: %s is %d seconds stale.' % (
+        args.metric, staleness.total_seconds()), file=sys.stderr)
+    sys.exit(2)
+elif args.warning and staleness > args.warning:
+    print('WARNING: %s is %d seconds stale.' % (
+        args.metric, staleness.total_seconds()), file=sys.stderr)
+    sys.exit(1)
+else:
+    print('OK: %s is fresh.' % args.metric, file=sys.stderr)
+    sys.exit(1)
diff --git a/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg b/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg
new file mode 100644
index 0000000..9c7ac64
--- /dev/null
+++ b/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg
@@ -0,0 +1,4 @@
+# Arguments: $ARG1$ = metric name, $ARG2$ = Graphite render URL,
+# $ARG3$ = warning threshold, $ARG4$ = critical threshold.
+define command{
+    command_name    check_graphite_freshness
+    command_line    $USER1$/check_graphite_freshness $ARG1$ $ARG2$ -w $ARG3$ -c $ARG4$
+}
diff --git a/modules/nagios_common/manifests/commands.pp b/modules/nagios_common/manifests/commands.pp
index 0d4a056..5e24a78 100644
--- a/modules/nagios_common/manifests/commands.pp
+++ b/modules/nagios_common/manifests/commands.pp
@@ -45,6 +45,7 @@
         'check_bgp',
         'check_dsh_groups',
         'check_graphite',
+        'check_graphite_freshness',
         'check_ifstatus_nomon',
         'check_jnx_alarms',
         'check_ores_workers',

-- 
To view, visit https://gerrit.wikimedia.org/r/251675
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I2dcb8b08f2aa70108ead46c9510cfcb521abe158
Gerrit-PatchSet: 7
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to