Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/254846
Change subject: Update monitoring function using graphite ...................................................................... Update monitoring function using graphite Add 'until' parameter providing shifted in time monitoring. Useful for the metric 'eventlogging_difference_raw_validated', which raises false alarms due to graphite data not being up-to-date between (now-5mins) and now. Bug: T116035 Change-Id: I4b549e23b40bce53833d86b2d4a06206512c8b98 --- M modules/eventlogging/manifests/monitoring/graphite.pp M modules/monitoring/manifests/graphite_threshold.pp M modules/nagios_common/files/check_commands/check_graphite M modules/nagios_common/files/check_commands/check_graphite.cfg 4 files changed, 39 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/46/254846/1 diff --git a/modules/eventlogging/manifests/monitoring/graphite.pp b/modules/eventlogging/manifests/monitoring/graphite.pp index 3f097b8..cdfa689 100644 --- a/modules/eventlogging/manifests/monitoring/graphite.pp +++ b/modules/eventlogging/manifests/monitoring/graphite.pp @@ -42,22 +42,18 @@ } # Warn/Alert if the difference between raw and valid EventLogging - # alerts gets too big. + # alerts gets too big. We put a 5 minute lag because of metrics + # not being correct in graphite before. # If the difference gets too big, either the validation step is # overloaded, or high volume schemas are failing validation. - # - # Since diffed series are not fully synchronized, the plain diff - # would gives a trajectory that is flip/flopping above and below - # zero ~50 events/s. Hence, we average the diff over 10 - # readings. That way, we dampen flip/flopping enough to get a - # characteristic that is worth alerting on. 
monitoring::graphite_threshold { 'eventlogging_difference_raw_validated': description => 'Difference between raw and validated EventLogging overall message rates', - metric => "movingAverage(absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric})),10)", + metric => "absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric}))", warning => 20, critical => 30, - percentage => 25, # At least 4 of the 15 readings - from => '15min', + percentage => 25, # At least 4 of the (20 - 5) = 15 readings + from => '20min', + until => '5min' contact_group => 'analytics', } } diff --git a/modules/monitoring/manifests/graphite_threshold.pp b/modules/monitoring/manifests/graphite_threshold.pp index 6ea2bfa..a9f51e3 100644 --- a/modules/monitoring/manifests/graphite_threshold.pp +++ b/modules/monitoring/manifests/graphite_threshold.pp @@ -54,6 +54,7 @@ $critical, $series = false, $from = '10min', + $until = '0min', $percentage = 1, $under = false, $graphite_url = 'http://graphite.wikimedia.org', @@ -80,9 +81,10 @@ # $ARG3$ the metric to monitor # $ARG4$ -W warning threshold # $ARG5$ -C critical threshold - # $ARG6$ --from start sampling date - # $ARG7$ --perc percentage of exceeding datapoints - # $ARG8$ --over or --under + # $ARG6$ --from start sampling date (negative relative time from now) + # $ARG7$ --until end sampling date (negative relative time from now) + # $ARG8$ --perc percentage of exceeding datapoints + # $ARG9$ --over or --under $modifier = $under ? 
{ true => '--under', default => '--over' @@ -99,7 +101,7 @@ monitoring::service { $title: ensure => $ensure, description => $description, - check_command => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}", + check_command => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}", retries => $retries, group => $group, critical => $nagios_critical, diff --git a/modules/nagios_common/files/check_commands/check_graphite b/modules/nagios_common/files/check_commands/check_graphite index 9c344ee..71c178e 100755 --- a/modules/nagios_common/files/check_commands/check_graphite +++ b/modules/nagios_common/files/check_commands/check_graphite @@ -175,6 +175,11 @@ help='When to fetch the metric from (date or "-1d")', default='-1h') p.add_argument( + '--until', + dest='_until', + help='When to fetch the metric until (date or "-1d")', + default='-0min') + p.add_argument( '--over', dest='over', action='store_true', @@ -194,6 +199,17 @@ help='Number of datapoints above or below threshold that will raise the alarm') return p + def check_time_parameter(self, param_name, param_value): + m = self.from_regex.match(param_value) + if not m: + raise ValueError( + 'the value of the %s argument is invalid: %s' % + (param_name, param_value)) + if m.group(2) not in self._accepted_time_defs: + raise ValueError('The unit specification for %s' % param_name + + 'should be one of the following: %s' % + ','.join(self._accepted_time_defs)) + def get_all(self, args): ''' Gets additional data from the command-line args. 
@@ -204,18 +220,13 @@ _from = args._from if not args._from.startswith('-'): _from = '-%s' % args._from - m = self.from_regex.match(_from) - if not m: - raise ValueError( - 'the value of the --from argument is invalid: %s' % - args._from) - if m.group(2) not in self._accepted_time_defs: - raise ValueError( - 'The unit specification for --from should be one of the following: %s' % - ','.join( - self._accepted_time_defs)) + self.check_time_parameter('--from', _from) + _until = args._until + if not args._until.startswith('-'): + _until = '-%s' % args._until + self.check_time_parameter('--until', _until) - self.params = [('format', 'json'), ('from', _from)] + self.params = [('format', 'json'), ('from', _from), ('until', _until)] for target in self.targets: self.params.append(('target', target)) if args.under: @@ -519,11 +530,12 @@ Examples: - Check if a metric exceeds a certain value 10 times in the last 20 minutes: + Check if a metric exceeds a certain value 10 times in the last 20 minutes + with a 5 minutes lag: ./check_graphite --url http://some-graphite-host \ check_threshold my.beloved.metric --from -20minutes \ - --threshold 100 --over -C 10 -W 5 + --until -5minutes --threshold 100 --over -C 10 -W 5 Check if a metric has exceeded its holt-winters confidence bands 5% of the times over the last 500 checks diff --git a/modules/nagios_common/files/check_commands/check_graphite.cfg b/modules/nagios_common/files/check_commands/check_graphite.cfg index b8baf3a..e2d5421 100644 --- a/modules/nagios_common/files/check_commands/check_graphite.cfg +++ b/modules/nagios_common/files/check_commands/check_graphite.cfg @@ -2,12 +2,12 @@ # Generic checks for graphite define command{ command_name check_graphite_threshold - command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ $ARG8$ + command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until 
$ARG7$ --perc $ARG8$ $ARG9$ } define command{ command_name check_graphite_series_threshold - command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ $ARG8$ + command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ --perc $ARG8$ $ARG9$ } define command{ -- To view, visit https://gerrit.wikimedia.org/r/254846 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I4b549e23b40bce53833d86b2d4a06206512c8b98 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Joal <j...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits