Ori.livneh has submitted this change and it was merged. Change subject: Add an Icinga check for Graphite metric freshness ......................................................................
Add an Icinga check for Graphite metric freshness Checks a Graphite metric and generates WARNING or CRITICAL states if the most recent datapoint is older than the required freshness threshold. Change-Id: I2dcb8b08f2aa70108ead46c9510cfcb521abe158 --- A modules/monitoring/manifests/graphite_freshness.pp A modules/nagios_common/files/check_commands/check_graphite_freshness A modules/nagios_common/files/check_commands/check_graphite_freshness.cfg M modules/nagios_common/manifests/commands.pp 4 files changed, 141 insertions(+), 0 deletions(-) Approvals: Ori.livneh: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/monitoring/manifests/graphite_freshness.pp b/modules/monitoring/manifests/graphite_freshness.pp new file mode 100644 index 0000000..60baa43 --- /dev/null +++ b/modules/monitoring/manifests/graphite_freshness.pp @@ -0,0 +1,53 @@ +# == Define: monitoring::graphite_freshness +# +# Provisions an Icinga check that ensures a Graphite metric is 'fresh': +# that is, continuing to receive updates. +# +# === Parameters +# +# [*metric*] +# Graphite metric name. For example: 'reqstats.500'. +# Defaults to the resource title. +# +# [*warning*] +# Warn if most recent datapoint is older than this value. +# Value suffix may be one of 's', 'm', 'h' or 'd' for seconds, +# minutes, hours, or days, respectively. +# +# [*critical*] +# Crit if most recent datapoint is older than this value. +# Value suffix may be one of 's', 'm', 'h' or 'd' for seconds, +# minutes, hours, or days, respectively. +# +# [*graphite_url*] +# URL of Graphite's render API endpoint. +# Defaults to 'https://graphite.wikimedia.org/render'. +# +# [*contact_group*] +# Icinga contact group that should receive alerts. +# Defaults to 'admins'. +# +# === Examples +# +# # Emit a warning if most recent datapoint for metric 'reqerror.500' +# # is older than 5 minutes, and a critical alert if older than 10. 
+#   monitoring::graphite_freshness { 'reqerror.500':
+#     warning => '5m',
+#     critical => '10m',
+#   }
+#
+define monitoring::graphite_freshness(
+    $warning,
+    $critical,
+    $metric = $title,
+    $ensure = present,  # passed straight through to monitoring::service
+    $graphite_url = 'https://graphite.wikimedia.org/render',
+    $contact_group = 'admins'
+) {
+    monitoring::service { $title:  # check_command args: metric, render URL, warning, critical
+        ensure => $ensure,
+        description => "'${metric}' Graphite freshness",
+        check_command => "check_graphite_freshness!${metric}!${graphite_url}!${warning}!${critical}",
+        contact_group => $contact_group,
+    }
+}
diff --git a/modules/nagios_common/files/check_commands/check_graphite_freshness b/modules/nagios_common/files/check_commands/check_graphite_freshness
new file mode 100755
index 0000000..bc6fb7e
--- /dev/null
+++ b/modules/nagios_common/files/check_commands/check_graphite_freshness
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+  check_graphite_freshness
+  ~~~~~~~~~~~~~~~~~~~~~~~~
+
+  Checks a Graphite metric and generates WARNING or CRITICAL states if
+  the most recent datapoint is older than the required freshness threshold.
+
+  Usage:
+    check_graphite_freshness [-w THRESHOLD] [-c THRESHOLD] METRIC RENDER_URL
+
+  Positional arguments:
+    METRIC            metric name
+    RENDER_URL        URL of graphite's render API
+
+  Optional arguments:
+    -w THRESHOLD, --warning THRESHOLD   warn if most recent datapoint
+                                        is older than this value
+    -c THRESHOLD, --critical THRESHOLD  alert if most recent datapoint
+                                        is older than this value
+
+"""
+from __future__ import print_function
+
+import sys
+reload(sys)
+sys.setdefaultencoding("utf-8")
+
+import argparse
+import datetime
+import json
+import time
+import urllib2
+
+
+def time_spec(spec_string):
+    """Parse '<count>[s|m|h|d]' into a timedelta; the unit defaults to seconds.
+    Raises ValueError on bad input so that argparse reports a usage error."""
+    units = {'s': 'seconds', 'm': 'minutes', 'h': 'hours', 'd': 'days'}
+    unit, count = 'seconds', spec_string
+    if spec_string[-1:].isalpha():
+        # An unknown suffix maps to None here instead of raising KeyError.
+        unit, count = units.get(spec_string[-1]), spec_string[:-1]
+    if unit is None:
+        raise ValueError('unknown unit in %r' % spec_string)
+    return datetime.timedelta(**{unit: int(count)})
+
+
+ap = argparse.ArgumentParser(description='Graphite staleness alert')
+ap.add_argument('metric', help='metric name')
+ap.add_argument('render_url', help="URL of graphite's render API")
+ap.add_argument('-w', '--warning', type=time_spec, metavar='THRESHOLD',
+                help='warn if most recent datapoint is older than this value')
+ap.add_argument('-c', '--critical', type=time_spec, metavar='THRESHOLD',
+                help='alert if most recent datapoint is older than this value')
+args = ap.parse_args()
+if args.critical is None and args.warning is None:
+    ap.error('You must specify one (or both) of -w/--warning or -c/--critical')
+
+
+try:
+    url = args.render_url + '?format=json&target=' + args.metric
+    data = json.load(urllib2.urlopen(url))[0]
+    most_recent = datetime.datetime.utcfromtimestamp(max(
+        ts for value, ts in data['datapoints'] if value is not None))
+    staleness = datetime.datetime.utcnow() - most_recent
+except Exception as e:
+    # Exit 3 (UNKNOWN); a re-raise would exit 1, which Icinga reads as WARNING.
+    print('UNKNOWN: failed to check %s: %s' % (args.metric, e))
+    sys.exit(3)
+
+if args.critical is not None and staleness > args.critical:
+    print('CRITICAL: %s is %d seconds stale.' % (
+        args.metric, staleness.total_seconds()))  # stdout: Icinga reads plugin output there
+    sys.exit(2)  # CRITICAL
+elif args.warning is not None and staleness > args.warning:
+    print('WARNING: %s is %d seconds stale.' % (
+        args.metric, staleness.total_seconds()))
+    sys.exit(1)  # WARNING
+else:
+    print('OK: %s is fresh.' % args.metric)
+    sys.exit(0)  # OK
diff --git a/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg b/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg
new file mode 100644
index 0000000..9c7ac64
--- /dev/null
+++ b/modules/nagios_common/files/check_commands/check_graphite_freshness.cfg
@@ -0,0 +1,4 @@
+define command{
+    command_name    check_graphite_freshness
+    command_line    $USER1$/check_graphite_freshness $ARG1$ $ARG2$ -w $ARG3$ -c $ARG4$
+}
diff --git a/modules/nagios_common/manifests/commands.pp b/modules/nagios_common/manifests/commands.pp
index 0d4a056..5e24a78 100644
--- a/modules/nagios_common/manifests/commands.pp
+++ b/modules/nagios_common/manifests/commands.pp
@@ -45,6 +45,7 @@
         'check_bgp',
         'check_dsh_groups',
         'check_graphite',
+        'check_graphite_freshness',
         'check_ifstatus_nomon',
         'check_jnx_alarms',
         'check_ores_workers',
-- 
To view, visit https://gerrit.wikimedia.org/r/251675
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I2dcb8b08f2aa70108ead46c9510cfcb521abe158
Gerrit-PatchSet: 7
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits