Giuseppe Lavagetto has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/125726

Change subject: Substituting the check_graphite script.
......................................................................

Substituting the check_graphite script.

As I was finding it extremely difficult to interact with the original
Disqus plugin, I decided to spend some time refactoring it.
It is now more modular and will be easier to use and to modify to suit
our purposes.

Also changed the command in checkcommands.cfg.erb accordingly.

Change-Id: Ief0c4e6e50eda99cf7f94d8c5674d5573a8fc895
Signed-off-by: Giuseppe Lavagetto <glavage...@wikimedia.org>
---
M files/icinga/check_graphite
M templates/icinga/checkcommands.cfg.erb
2 files changed, 292 insertions(+), 263 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/26/125726/1

diff --git a/files/icinga/check_graphite b/files/icinga/check_graphite
old mode 100644
new mode 100755
index bedb7c1..f81c8e0
--- a/files/icinga/check_graphite
+++ b/files/icinga/check_graphite
@@ -3,284 +3,313 @@
 check_graphite.py
 ~~~~~~~
 
-:copyright: (c) 2012 DISQUS.
+Based on the original plugin from Disqus:
+    https://github.com/disqus/nagios-plugins
+
+:copyright: (c) 2014 Wikimedia Foundation
 :license: Apache License 2.0, see LICENSE for more details.
 """
 
+import os
 import json
-import optparse
-import urllib
-import urllib2
-import sys
-
+import urllib3
 from numbers import Real
+import argparse
+import ssl
+import sys
+from collections import defaultdict
 
-NAGIOS_STATUSES = {
-    'OK': 0,
-    'WARNING': 1,
-    'CRITICAL': 2,
-    'UNKNOWN': 3
+try:
+    # Python 2/3 compatibility: urlparse moved to urllib.parse in Python 3
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
+
+
+class NagiosException(Exception):
+    NAGIOS_STATUSES = {
+        'OK': 0,
+        'WARNING': 1,
+        'CRITICAL': 2,
+        'UNKNOWN': 3
 }
 
-class Graphite(object):
+    def __init__(self, exitcode, msg):
+        self.exitcode = self.NAGIOS_STATUSES.get(exitcode, 3)
+        self.msg = "%s: %s" % (exitcode, msg)
 
-    def __init__(self, url, targets, _from, _until):
-        self.url = url.rstrip('/')
-        self.targets = targets
-        self._from = _from
-        self._until = _until
-        params = [('target', t) for t in self.targets] +\
-            [('from', self._from)] +\
-            [('until', self._until)] +\
-            [('format', 'json')]
-        self.full_url = self.url + '/render?' +\
-            urllib.urlencode(params)
 
-    def check_datapoints(self, datapoints, check_func, **kwargs):
-        """Find alerting datapoints
+    def exit(self):
+        sys.exit(self.exitcode)
 
-        Args:
-            datapoints (list): The list of datapoints to check
-
-        Kwargs:
-            check_func (function): The function to find out of bounds 
datapoints
-            bounds (list): Compare against `datapoints` to find out of bounds 
list
-            compare (list): Used for comparison if `datapoints` is out of 
bounds
-            threshold (float): `check_func` is called for each datapoint 
against `threshold`
-            beyond (float): Return datapoint if `beyond` value in bounds list 
(percentage).
-
-        Returns:
-            The list of out of bounds datapoints
-        """
-        if 'threshold' in kwargs:
-            return [x for x in datapoints if isinstance(x, Real) and 
check_func(x, kwargs['threshold'])]
-        elif 'bounds' in kwargs:
-            if 'compare' in kwargs:
-              return [datapoints[x] for x in xrange(len(datapoints)) if 
all([datapoints[x], kwargs['bounds'][x], kwargs['compare'][x]]) and 
check_func(datapoints[x] / kwargs['bounds'][x], kwargs['beyond']) and 
check_func(datapoints[x], kwargs['compare'][x])]
-            else:
-                return [datapoints[x] for x in xrange(len(datapoints)) if 
all([datapoints[x], kwargs['bounds'][x]]) and check_func(datapoints[x], 
kwargs['bounds'][x])]
-
-    def fetch_metrics(self):
-        try:
-            response = urllib2.urlopen(self.full_url)
-
-            if response.code != 200:
-                return None
-            else:
-                return json.loads(response.read())
-        except urllib2.URLError, TypeError:
-            return None
-
-    def generate_output(self, datapoints, *args, **kwargs):
-        """Generate check output
-
-        Args:
-            datapoints (list): The list of datapoints to check
-            warn_oob (list): Optional list of datapoints considered in warning 
state
-            crit_oob (list): Mandatory list of datapoints considered in 
warning state
-
-        Kwargs:
-            count (int): Number of metrics that would generate an alert
-            warning (float): The check's warning threshold
-            critical (float): The check's critical threshold
-            target (str): The target for `datapoints`
-
-        Returns:
-            A dictionary of datapoints grouped by their status ('CRITICAL', 
'WARNING', 'OK')
-        """
-        check_output = dict(OK=[], WARNING=[], CRITICAL=[])
-        count = kwargs['count']
-        warning = kwargs.get('warning', 0)
-        critical = kwargs.get('critical', 0)
-        target = kwargs.get('target', 'timeseries')
-
-        if len(args) > 1:
-            (warn_oob, crit_oob) = args
+class GraphiteCheck(object):
+    parser_name='check_generic'
+    """
+    Nothing to see here
+    """
+    def __init__(self, args):
+        self.targets = args.metric.split(',')
+        parsed_url = urlparse(args.url)
+        if parsed_url.netloc.find('@') > 0:
+            (self.credentials, host) = parsed_url.netloc.split('@')
         else:
-            crit_oob = [x for x in args[0] if isinstance(x, Real)]
-            warn_oob = []
+            host = parsed_url.netloc
+            self.credentials = None
+        self.base_url= "%s://%s" % (parsed_url.scheme, host)
+        #subclasses should just implement get_all here.
+        self.get_all(args)
 
-        if self.has_numbers(crit_oob) and len(crit_oob) >= count:
-            check_output['CRITICAL'].append('%s [crit=%f|datapoints=%s]' %\
-                (target, critical, ','.join(['%s' % str(x) for x in 
crit_oob])))
-        elif self.has_numbers(warn_oob) and len(warn_oob) >= count:
-            check_output['WARNING'].append('%s [warn=%f|datapoints=%s]' %\
-                (target, warning, ','.join(['%s' % str(x) for x in warn_oob])))
-        else:
-            check_output['OK'].append('%s [warn=%0.3f|crit=%f|datapoints=%s]' 
%\
-                (target, warning, critical, ','.join(['%s' % str(x) for x in 
datapoints])))
+        #Set up the http connectionpool
+        http_opts = {}
+        if args.timeout:
+            http_opts['timeout'] = args.timeout
+        if parsed_url.scheme == 'https':
+            http_opts['ssl_version'] = ssl.PROTOCOL_TLSv1
+            if args.ssl_certs:
+                # We expect a combined cert
+                http_opts['cert_file'] = args.ssl_certs
+            #TODO: verify SSL by default
 
-        return check_output
+        self.http = urllib3.PoolManager(num_pools=10, **http_opts)
 
-    def has_numbers(self, lst):
+
+    def get_all(self,args):
+        #This should be implemented in subclasses
+        raise NotImplementedError
+
+    def fetch(self):
+        h = {'user_agent': 'check_graphite/1.0'}
+
+        if self.credentials:
+            h['basic_auth'] = self.credentials
+
+        full_url = "%s/render" % self.base_url
         try:
-            return any([isinstance(x, Real) for x in lst])
-        except TypeError:
-            return False
+            response = self.http.request(
+                'GET',
+                full_url,
+                fields=self.params,
+                redirect=True,
+                headers=urllib3.util.make_headers(**h)
+            )
+        except urllib3.exceptions.MaxRetryError:
+            raise NagiosException('UNKNOWN', 'Could not reach the graphite 
server at %s' % full_url)
+
+        if response.status != 200:
+            raise NagiosException('UNKNOWN', 'Got status %d from the graphite 
server at %s' % (response.status, full_url))
+
+        return json.loads(response.data)
+
+    @classmethod
+    def create_parser(cls, parser):
+        p = parser.add_parser(cls.parser_name, help=cls.__doc__)
+        p.add_argument('metric', metavar='METRIC', help='the metric to fetch 
from graphite')
+        p.add_argument('-C', '--critical', dest='crit', type=int, 
help='Threshold for critical alert (integer)')
+        p.add_argument('-W', '--warning', dest='warn', type=int, 
help='Threshold for warning (integer)')
+        p.set_defaults(check_class=cls)
+        return p
+
+    def parse_result(self, result):
+        raise NotImplementedError
+
+    def check_data(self, datapoints):
+        raise NotImplementedError
+
+    def run(self):
+        res = self.fetch()
+        dp = self.parse_result(res)
+        self.check_data(dp)
+
+class Threshold(GraphiteCheck):
+    """
+    Checks if the metric exceeds the desired threshold
+    """
+    parser_name='check_threshold'
+
+    @classmethod
+    def create_parser(cls,parser):
+        p = super(Threshold,cls).create_parser(parser)
+        p.add_argument('--from', dest='_from', help='When to fetch the metric 
from (date or "-1d")', default='-1h')
+        p.add_argument('--over', dest="over", action='store_true', 
default=True, help='If alarms should happen when we exceed the threshold')
+        p.add_argument('--under', dest="under", action='store_true', 
default=False, help='If alarms should happen when we are below the threshold')
+        p.add_argument('--perc', dest="percentage", default=1, help='Number of 
datapoints above threshold that will raise the alarm')
+        return p
+
+    def get_all(self,args):
+        self.params = [('format', 'json'), ('from', args._from)]
+        for target in self.targets:
+            self.params.append(('target', target))
+        if args.under:
+            self.check_func = lambda x, y: x < y
+        else:
+            self.check_func = lambda x, y: x > y
+        self.limits ={}
+        self.limits['WARNING'] = args.warn
+        self.limits['CRITICAL'] = args.crit
+        self.perc = args.percentage
+
+
+    def parse_result(self, result):
+        #TODO: make this work for lists of results
+        datapoints = defaultdict(list)
+        datapoints['_total'] = 0
+        for (data,time) in result[0]['datapoints']:
+            if not isinstance(data, Real):
+                datapoints['UNKOWN'].append((time, data))
+                continue
+            elif self.check_func(data, self.limits['CRITICAL']):
+                datapoints['CRITICAL'].append((time, data))
+
+            elif self.check_func(data, self.limits['WARNING']):
+                datapoints['WARNING'].append((time,data))
+            else:
+                datapoints['OK'].append((time,data))
+            datapoints['_total'] += 1
+        return datapoints
+
+    def check_data(self,datapoints):
+        #TODO: make this work for lists of results
+        if not datapoints['_total']:
+            raise NagiosException('UNKNOWN', 'No valid datapoints found')
+
+        lengths = {}
+        t = datapoints['_total']
+        for key in NagiosException.NAGIOS_STATUSES.keys():
+            lengths[key] = len(datapoints[key])
+        #Very simple count, no timeseries evaluation, no flap detection.
+        if t < lengths['UNKNOWN']:
+            raise NagiosException('UNKNOWN', 'More than half of the datapoints 
are undefined')
+        for key in ['CRITICAL', 'WARNING']:
+            if lengths[key] >= t*self.perc/100.0:
+                perc = lengths[key]*100.0/t
+                raise NagiosException(key,
+                                      '%s%% of data exceeded the %s threshold 
[%s]' %
+                                      (perc, key.lower(), self.limits[key]))
+        raise NagiosException('OK', 'Less than %s%% data above the threshold 
[%s]' % (self.perc, self.limits['WARNING']))
+
+class Anomaly(GraphiteCheck):
+    """
+    Checks if the metric is out of the forecasted bounds for a number of times 
in the last iterations
+    """
+    parser_name='check_anomaly'
+
+    @classmethod
+    def create_parser(cls,parser):
+        p = super(Anomaly,cls).create_parser(parser)
+        p.add_argument('--check_window', dest="check_window", type=int, 
help='How many datapoints to consider in the anomaly detection sampling', 
default=20)
+        return p
+
+    def get_all(self,args):
+        self.params = [('format', 'json')]
+        for target in self.targets:
+            self.params.append(('target', target))
+            self.params.append(('target', 'holtWintersConfidenceBands(%s)' % 
target))
+        self.check_window = args.check_window
+        self.warn = args.warn
+        self.crit = args.crit
+
+
+    def parse_result(self, result):
+        #TODO: make this work for lists of results
+        datapoints = defaultdict(list)
+        my_slice = self.check_window * -1
+        measures = result[0]['datapoints'][my_slice:]
+        lowerbound = result[1]['datapoints'][my_slice:]
+        upperbound = result[2]['datapoints'][my_slice:]
+        for i in xrange(self.check_window):
+            data, time = measures[i]
+            l = lowerbound[i][0]
+            u = upperbound[i][0]
+            if not isinstance(data, Real):
+                datapoints['unknown'].append((time, data))
+            elif data >= u:
+                datapoints['higher'].append((time,data))
+            elif data <= l:
+                datapoints['lower'].append((time,data))
+            else:
+                datapoints['ok'].append((time,data))
+        return datapoints
+
+    def check_data(self, datapoints):
+        u = len(datapoints['unknown'])
+        h = len(datapoints['higher'])
+        l = len(datapoints['lower'])
+        ok = len(datapoints['ok'])
+        t = h + l + ok
+        if not t:
+            raise NagiosException('UNKNOWN', 'No valid datapoints found')
+
+        if t < u:
+            raise NagiosException('UNKNOWN', 'More than half of the datapoints 
are undefined')
+
+        #Simple check, with basic flap detection
+        crit = (h >= self.crit) or (l >= self.crit)
+        crit_flap = (h >= self.crit) and (l >= self.crit)
+        if (h >= self.crit) or (l >= self.crit):
+            if (h >= self.crit) and (l >= self.crit):
+                raise NagiosException('UNKNOWN', 'Service is critically 
flapping below and above the confidence bounds')
+            raise NagiosException('CRITICAL', 'Anomaly detected: %s data above 
and %s below the confidence bounds' % (h, l))
+
+        if (h >= self.warn) or (l >= self.warn):
+            if (h >= self.warn) and (l >= self.warn):
+                raise NagiosException('UNKNOWN', 'Service is flapping below 
and above the confidence bounds')
+            raise NagiosException('WARNING', 'Anomaly detected: %s data above 
and %s below the confidence bounds' % (h, l))
+
+        raise NagiosException('OK', 'No anomaly detected')
+
+
+def main():
+    """
+    Controller for the graphite fetching plugin.
+
+    You can build a few different type of checks, both traditional nagios 
checks
+    and anomaly detection ones.
+
+    Examples:
+
+    Check if a metric exceeds a certain value 10 times in the last 20 minutes:
+
+    ./check_graphite.py --url http://some-graphite-host \
+           check_threshold my.beloved.metric  --from -20m \
+           --threshold 100 --over -C 10 -W 5
+
+    Check if a metric has exceeded its Holt-Winters confidence bands 5% of the
+    times over the last 500 checks
+
+    ./check_graphite.py --url http://some-graphite-host  \
+          check_anomaly my.beloved.metric --check_window 500 -C 5 -W 1
+
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        title='check_type',
+        help='use with --help for additional help',
+        dest='check_type')
+
+    threshold = Threshold.create_parser(subparsers)
+
+    anomaly = Anomaly.create_parser(subparsers)
+
+
+    parser.add_argument('-U', '--url', dest='url',
+                        default=os.environ.get('GRAPHITE_URL', 
'http://localhost'),
+                        help='Url of the graphite server'
+                        )
+    parser.add_argument('-T', '--timeout', dest='timeout', default=10, 
help='Timeout on the graphite call (defaults to 10)')
+    parser.add_argument('-S', '--client-ssl-cert', dest='ssl_certs', 
default=None, help='SSL client certificate to use in connection (filename)')
+
+
+    args = parser.parse_args()
+
+    try:
+        checker = args.check_class(args)
+        checker.run()
+    except NagiosException as e:
+        print(e.msg)
+        e.exit()
 
 
 if __name__ == '__main__':
-    parser = optparse.OptionParser()
-    parser.add_option('-U', '--graphite-url', dest='graphite_url',
-                      default='http://localhost/',
-                      metavar='URL',
-                      help='Graphite URL [%default]')
-    parser.add_option('-t', '--target', dest='target',
-                      action='append',
-                      help='Target to check')
-    parser.add_option('--compare', dest='compare',
-                      metavar='SERIES',
-                      help='Compare TARGET against SERIES')
-    parser.add_option('--from', dest='_from',
-                      help='From timestamp/date')
-    parser.add_option('--until', dest='_until',
-                      default='now',
-                      help='Until timestamp/date [%default]')
-    parser.add_option('-c', '--count', dest='count',
-                      default=0,
-                      type='int',
-                      help='Alert on at least COUNT metrics [%default]')
-    parser.add_option('--beyond', dest='beyond',
-                      default=0.7,
-                      type='float',
-                      help='Alert if metric is PERCENTAGE beyond comparison 
value [%default]')
-    parser.add_option('--percentile', dest='percentile',
-                      default=0,
-                      type='int',
-                      metavar='PERCENT',
-                      help='Use nPercentile Graphite function on the target 
(returns one datapoint)')
-    parser.add_option('--empty-ok', dest='empty_ok',
-                      default=False,
-                      action='store_true',
-                      help='Empty data from Graphite is OK')
-    parser.add_option('--confidence', dest='confidence_bands',
-                      default=False,
-                      action='store_true',
-                      help='Use holtWintersConfidenceBands Graphite function 
on the target')
-    parser.add_option('--over', dest='over',
-                      default=True,
-                      action='store_true',
-                      help='Over specified WARNING or CRITICAL threshold 
[%default]')
-    parser.add_option('--under', dest='under',
-                      default=False,
-                      action='store_true',
-                      help='Under specified WARNING or CRITICAL threshold 
[%default]')
-    parser.add_option('-W', dest='warning',
-                      type='float',
-                      metavar='VALUE',
-                      help='Warning if datapoints beyond VALUE')
-    parser.add_option('-C', dest='critical',
-                      type='float',
-                      metavar='VALUE',
-                      help='Critical if datapoints beyond VALUE')
-
-    (options, args) = parser.parse_args()
-
-    if not all([getattr(options, option) for option in ('_from', 'target')]):
-        parser.print_help()
-        sys.exit(NAGIOS_STATUSES['UNKNOWN'])
-
-    real_from = options._from
-
-    if options.under:
-        check_func = lambda x, y: x < y
-        options.over = False
-    else:
-        check_func = lambda x, y: x > y
-
-    if options.confidence_bands:
-        targets = [options.target[0], 'holtWintersConfidenceBands(%s)' % 
options.target[0]]
-        check_threshold = None
-        from_slice = int(options._from) * -1
-        real_from = '-2w'
-
-        if options.compare:
-            targets.append(options.compare)
-    else:
-        if not all([getattr(options, option) for option in ('critical', 
'warning')]):
-            parser.print_help()
-            sys.exit(NAGIOS_STATUSES['UNKNOWN'])
-
-        if options.percentile:
-            targets = ['nPercentile(%s, %d)' % (options.target[0], 
options.percentile)]
-        else:
-            targets = options.target
-
-        try:
-            warn = float(options.warning)
-            crit = float(options.critical)
-        except ValueError:
-            print 'ERROR: WARNING or CRITICAL threshold is not a number\n'
-            parser.print_help()
-            sys.exit(NAGIOS_STATUSES['UNKNOWN'])
-
-    check_output = {}
-    graphite = Graphite(options.graphite_url, targets, real_from, 
options._until)
-    metric_data = graphite.fetch_metrics()
-
-    if metric_data:
-        if options.confidence_bands:
-            actual = [x[0] for x in metric_data[0].get('datapoints', 
[])][from_slice:]
-            target_name = metric_data[0]['target']
-            kwargs = {}
-            kwargs['beyond'] = options.beyond
-
-            if options.over:
-                kwargs['bounds'] = [x[0] for x in 
metric_data[1].get('datapoints', [])][from_slice:]
-            elif options.under:
-                kwargs['bounds'] = [x[0] for x in 
metric_data[2].get('datapoints', [])][from_slice:]
-
-            if options.compare:
-                kwargs['compare'] = [x[0] for x in 
metric_data[3].get('datapoints', [])][from_slice:]
-
-                if not graphite.has_numbers(kwargs['compare']):
-                    print 'CRITICAL: No compare target output from Graphite!'
-                    sys.exit(NAGIOS_STATUSES['CRITICAL'])
-
-            if graphite.has_numbers(actual) and 
graphite.has_numbers(kwargs['bounds']):
-                points_oob = graphite.check_datapoints(actual, check_func, 
**kwargs)
-                check_output[target_name] = graphite.generate_output(actual,
-                                                                     
points_oob,
-                                                                     
count=options.count,
-                                                                     
target=target_name)
-
-            else:
-                print 'CRITICAL: No output from Graphite for target(s): %s' % 
', '.join(targets)
-                sys.exit(NAGIOS_STATUSES['CRITICAL'])
-        else:
-            for target in metric_data:
-                datapoints = [x[0] for x in target.get('datapoints', []) if 
isinstance(x[0], Real)]
-                if not graphite.has_numbers(datapoints) and not 
options.empty_ok:
-                    print 'CRITICAL: No output from Graphite for target(s): 
%s' % ', '.join(targets)
-                    sys.exit(NAGIOS_STATUSES['CRITICAL'])
-
-                crit_oob = graphite.check_datapoints(datapoints, check_func, 
threshold=crit)
-                warn_oob = graphite.check_datapoints(datapoints, check_func, 
threshold=warn)
-                check_output[target['target']] = 
graphite.generate_output(datapoints,
-                                                                          
warn_oob,
-                                                                          
crit_oob,
-                                                                          
count=options.count,
-                                                                          
target=target['target'],
-                                                                          
warning=warn,
-                                                                          
critical=crit)
-    else:
-        if options.empty_ok and isinstance(metric_data, list):
-            print 'OK: No output from Graphite for target(s): %s' % ', 
'.join(targets)
-            sys.exit(NAGIOS_STATUSES['OK'])
-
-        print 'CRITICAL: No output from Graphite for target(s): %s' % ', 
'.join(targets)
-        sys.exit(NAGIOS_STATUSES['CRITICAL'])
-
-    for target, messages in check_output.iteritems():
-        if messages['CRITICAL']:
-            exit_code = NAGIOS_STATUSES['CRITICAL']
-        elif messages['WARNING']:
-            exit_code = NAGIOS_STATUSES['WARNING']
-        else:
-            exit_code = NAGIOS_STATUSES['OK']
-
-        for status_code in ['CRITICAL', 'WARNING', 'OK']:
-            if messages[status_code]:
-                print '\n'.join(['%s: %s' % (status_code, status) for status 
in messages[status_code]])
-
-    sys.exit(exit_code)
+    #TODO - fix the docs
+    __doc__ = main.__doc__
+    main()
diff --git a/templates/icinga/checkcommands.cfg.erb 
b/templates/icinga/checkcommands.cfg.erb
index ea9babc..b5ac22e 100644
--- a/templates/icinga/checkcommands.cfg.erb
+++ b/templates/icinga/checkcommands.cfg.erb
@@ -511,7 +511,7 @@
 
 define command{
        command_name    check_reqstats_5xx
-       command_line    $USER1$/check_graphite -U $ARG1$ --from $ARG2$  -t 
reqstats.5xx -W $ARG3$ -C $ARG4$
+       command_line    $USER1$/check_graphite -U $ARG1$ check_threshold 
reqstats.5xx -W $ARG3$ -C $ARG4$ --from $ARG2$  
 }
 
 # Checks whether a host belongs to given dsh group(s)

-- 
To view, visit https://gerrit.wikimedia.org/r/125726
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ief0c4e6e50eda99cf7f94d8c5674d5573a8fc895
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to