Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/254846

Change subject: Update monitoring function using graphite
......................................................................

Update monitoring function using graphite

Add an 'until' parameter to allow time-shifted monitoring.
Useful for the metric 'eventlogging_difference_raw_validated', which raises
false alarms because graphite data is not yet up to date between (now-5mins) and now.

Bug: T116035
Change-Id: I4b549e23b40bce53833d86b2d4a06206512c8b98
---
M modules/eventlogging/manifests/monitoring/graphite.pp
M modules/monitoring/manifests/graphite_threshold.pp
M modules/nagios_common/files/check_commands/check_graphite
M modules/nagios_common/files/check_commands/check_graphite.cfg
4 files changed, 39 insertions(+), 29 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/46/254846/1

diff --git a/modules/eventlogging/manifests/monitoring/graphite.pp 
b/modules/eventlogging/manifests/monitoring/graphite.pp
index 3f097b8..cdfa689 100644
--- a/modules/eventlogging/manifests/monitoring/graphite.pp
+++ b/modules/eventlogging/manifests/monitoring/graphite.pp
@@ -42,22 +42,18 @@
     }
 
     # Warn/Alert if the difference between raw and valid EventLogging
-    # alerts gets too big.
+    # alerts gets too big. We put a 5 minute lag because of metrics
+    # not being correct in graphite before.
     # If the difference gets too big, either the validation step is
     # overloaded, or high volume schemas are failing validation.
-    #
-    # Since diffed series are not fully synchronized, the plain diff
-    # would gives a trajectory that is flip/flopping above and below
-    # zero ~50 events/s. Hence, we average the diff over 10
-    # readings. That way, we dampen flip/flopping enough to get a
-    # characteristic that is worth alerting on.
     monitoring::graphite_threshold { 'eventlogging_difference_raw_validated':
         description   => 'Difference between raw and validated EventLogging 
overall message rates',
-        metric        => 
"movingAverage(absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric})),10)",
+        metric        => 
"absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric}))",
         warning       => 20,
         critical      => 30,
-        percentage    => 25, # At least 4 of the 15 readings
-        from          => '15min',
+        percentage    => 25, # At least 4 of the (20 - 5) = 15 readings
+        from          => '20min',
+        until         => '5min'
         contact_group => 'analytics',
     }
 }
diff --git a/modules/monitoring/manifests/graphite_threshold.pp 
b/modules/monitoring/manifests/graphite_threshold.pp
index 6ea2bfa..a9f51e3 100644
--- a/modules/monitoring/manifests/graphite_threshold.pp
+++ b/modules/monitoring/manifests/graphite_threshold.pp
@@ -54,6 +54,7 @@
     $critical,
     $series                = false,
     $from                  = '10min',
+    $until                 = '0min',
     $percentage            = 1,
     $under                 = false,
     $graphite_url          = 'http://graphite.wikimedia.org',
@@ -80,9 +81,10 @@
     #   $ARG3$  the metric to monitor
     #   $ARG4$  -W warning threshold
     #   $ARG5$  -C critical threshold
-    #   $ARG6$  --from start sampling date
-    #   $ARG7$  --perc percentage of exceeding datapoints
-    #   $ARG8$  --over or --under
+    #   $ARG6$  --from start sampling date (negative relative time from now)
+    #   $ARG7$  --until end sampling date (negative relative time from now)
+    #   $ARG8$  --perc percentage of exceeding datapoints
+    #   $ARG9$  --over or --under
     $modifier = $under ? {
         true  => '--under',
         default => '--over'
@@ -99,7 +101,7 @@
     monitoring::service { $title:
         ensure                => $ensure,
         description           => $description,
-        check_command         => 
"${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}",
+        check_command         => 
"${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}",
         retries               => $retries,
         group                 => $group,
         critical              => $nagios_critical,
diff --git a/modules/nagios_common/files/check_commands/check_graphite 
b/modules/nagios_common/files/check_commands/check_graphite
index 9c344ee..71c178e 100755
--- a/modules/nagios_common/files/check_commands/check_graphite
+++ b/modules/nagios_common/files/check_commands/check_graphite
@@ -175,6 +175,11 @@
             help='When to fetch the metric from (date or "-1d")',
             default='-1h')
         p.add_argument(
+            '--until',
+            dest='_until',
+            help='When to fetch the metric until (date or "-1d")',
+            default='-0min')
+        p.add_argument(
             '--over',
             dest='over',
             action='store_true',
@@ -194,6 +199,17 @@
             help='Number of datapoints above or below threshold that will 
raise the alarm')
         return p
 
+    def check_time_parameter(self, param_name, param_value):
+        m = self.from_regex.match(param_value)
+        if not m:
+            raise ValueError(
+                'the value of the %s argument is invalid: %s' %
+                (param_name, param_value))
+        if m.group(2) not in self._accepted_time_defs:
+            raise ValueError('The unit specification for %s' % param_name +
+                             'should be one of the following: %s' %
+                             ','.join(self._accepted_time_defs))
+
     def get_all(self, args):
         '''
         Gets additional data from the command-line args.
@@ -204,18 +220,13 @@
         _from = args._from
         if not args._from.startswith('-'):
             _from = '-%s' % args._from
-        m = self.from_regex.match(_from)
-        if not m:
-            raise ValueError(
-                'the value of the --from argument is invalid: %s' %
-                args._from)
-        if m.group(2) not in self._accepted_time_defs:
-            raise ValueError(
-                'The unit specification for --from should be one of the 
following: %s' %
-                ','.join(
-                    self._accepted_time_defs))
+        self.check_time_parameter('--from', _from)
+        _until = args._until
+        if not args._until.startswith('-'):
+            _until = '-%s' % args._until
+        self.check_time_parameter('--until', _until)
 
-        self.params = [('format', 'json'), ('from', _from)]
+        self.params = [('format', 'json'), ('from', _from), ('until', _until)]
         for target in self.targets:
             self.params.append(('target', target))
         if args.under:
@@ -519,11 +530,12 @@
 
     Examples:
 
-    Check if a metric exceeds a certain value 10 times in the last 20 minutes:
+    Check if a metric exceeds a certain value 10 times in the last 20 minutes
+    with a 5 minutes lag:
 
     ./check_graphite --url http://some-graphite-host \
            check_threshold my.beloved.metric  --from -20minutes \
-           --threshold 100 --over -C 10 -W 5
+           --until -5minutes --threshold 100 --over -C 10 -W 5
 
     Check if a metric has exceeded its holt-winters confidence bands 5% of the
     times over the last 500 checks
diff --git a/modules/nagios_common/files/check_commands/check_graphite.cfg 
b/modules/nagios_common/files/check_commands/check_graphite.cfg
index b8baf3a..e2d5421 100644
--- a/modules/nagios_common/files/check_commands/check_graphite.cfg
+++ b/modules/nagios_common/files/check_commands/check_graphite.cfg
@@ -2,12 +2,12 @@
 # Generic checks for graphite
 define command{
     command_name    check_graphite_threshold
-    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold 
'$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ $ARG8$
+    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold 
'$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ --perc $ARG8$ $ARG9$
 }
 
 define command{
     command_name    check_graphite_series_threshold
-    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ 
check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ 
$ARG8$
+    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ 
check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until 
$ARG7$ --perc $ARG8$ $ARG9$
 }
 
 define command{

-- 
To view, visit https://gerrit.wikimedia.org/r/254846
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4b549e23b40bce53833d86b2d4a06206512c8b98
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Joal <j...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to