Ottomata has submitted this change and it was merged. Change subject: Test addition of --until parameter to check_graphite check ......................................................................
Test addition of --until parameter to check_graphite check Commit https://gerrit.wikimedia.org/r/#/c/254846/3 was reverted when it unexpectedly broke most check_graphite alerts. This change applies it, but only to a single check on cp1054. The check_graphite_until_temp python script has been manually placed in /usr/lib/nagios/plugins on neon. Once we verify that this script and check with $until works, we will remove these conditionals and re-apply the original changes. Bug: T116035 Change-Id: I9873428d6cd5db24a1a31348b01d5b77e4c50bb7 --- M modules/monitoring/manifests/graphite_threshold.pp M modules/nagios_common/files/check_commands/check_graphite.cfg M modules/role/manifests/cache/kafka/webrequest.pp 3 files changed, 61 insertions(+), 19 deletions(-) Approvals: Ottomata: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/monitoring/manifests/graphite_threshold.pp b/modules/monitoring/manifests/graphite_threshold.pp index 6ea2bfa..7ae1408 100644 --- a/modules/monitoring/manifests/graphite_threshold.pp +++ b/modules/monitoring/manifests/graphite_threshold.pp @@ -54,6 +54,8 @@ $critical, $series = false, $from = '10min', + # temporarily use $until to conditionally use check_graphite_until command + $until = undef, $percentage = 1, $under = false, $graphite_url = 'http://graphite.wikimedia.org', @@ -80,33 +82,56 @@ # $ARG3$ the metric to monitor # $ARG4$ -W warning threshold # $ARG5$ -C critical threshold - # $ARG6$ --from start sampling date - # $ARG7$ --perc percentage of exceeding datapoints - # $ARG8$ --over or --under + # $ARG6$ --from start sampling date (negative relative time from now) + ##### $ARG7$ --until end sampling date (negative relative time from now) + # $ARG8$ --perc percentage of exceeding datapoints + # $ARG9$ --over or --under $modifier = $under ? 
{ - true => 'check_graphite_series_threshold', - default => 'check_graphite_threshold' } if $metric =~ /'/ { fail("single quotes will be stripped from graphite metric ${metric}, consider using double quotes") } - monitoring::service { $title: - ensure => $ensure, - description => $description, - check_command => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}", - retries => $retries, - group => $group, - critical => $nagios_critical, - passive => $passive, - freshness => $freshness, - normal_check_interval => $normal_check_interval, - retry_check_interval => $retry_check_interval, - contact_group => $contact_group, + # TEMPORARY conditional to test the --until arg without affecting all + # alerts. This conditional will be removed once we are sure until works. + if $until and !$series { + $command = 'check_graphite_threshold_until_temp' + + monitoring::service { $title: + ensure => $ensure, + description => $description, + check_command => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}", + retries => $retries, + group => $group, + critical => $nagios_critical, + passive => $passive, + freshness => $freshness, + normal_check_interval => $normal_check_interval, + retry_check_interval => $retry_check_interval, + contact_group => $contact_group, + } + } + else { + $command = $series ? 
{ + true => 'check_graphite_series_threshold', + default => 'check_graphite_threshold' + } + + monitoring::service { $title: + ensure => $ensure, + description => $description, + check_command => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}", + retries => $retries, + group => $group, + critical => $nagios_critical, + passive => $passive, + freshness => $freshness, + normal_check_interval => $normal_check_interval, + retry_check_interval => $retry_check_interval, + contact_group => $contact_group, + } } } diff --git a/modules/nagios_common/files/check_commands/check_graphite.cfg b/modules/nagios_common/files/check_commands/check_graphite.cfg index b8baf3a..68c60ac 100644 --- a/modules/nagios_common/files/check_commands/check_graphite.cfg +++ b/modules/nagios_common/files/check_commands/check_graphite.cfg @@ -14,3 +14,11 @@ command_name check_graphite_anomaly command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly '$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ $ARG7$ } + +# NOTE: This is a temporary check command that will be removed once it is +# verified to not break things and can be integrated into the regular +# check_graphite_threshold. +define command{ + command_name check_graphite_threshold_until_temp + command_line $USER1$/check_graphite_until_temp -U $ARG1$ -T $ARG2$ check_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ --perc $ARG8$ $ARG9$ +} diff --git a/modules/role/manifests/cache/kafka/webrequest.pp b/modules/role/manifests/cache/kafka/webrequest.pp index 39bc41f..4c18059 100644 --- a/modules/role/manifests/cache/kafka/webrequest.pp +++ b/modules/role/manifests/cache/kafka/webrequest.pp @@ -75,6 +75,14 @@ logster_options => "-o statsd --statsd-host=statsd.eqiad.wmnet:8125 --metric-prefix=${graphite_metric_prefix}", } + + # TEMPORARY test --until only on a single host + $until = $::fqdn ? 
{ + 'cp1054.eqiad.wmnet' => '0min', + default => undef, + } + + # Generate an alert if too many delivery report errors per minute # (logster only reports once a minute) monitoring::graphite_threshold { 'varnishkafka-kafka_drerr': @@ -88,6 +96,7 @@ # are over the threshold. percentage => 80, from => '10min', + until => $until, nagios_critical => false, require => Logster::Job['varnishkafka-webrequest'], ensure => 'present', -- To view, visit https://gerrit.wikimedia.org/r/255415 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I9873428d6cd5db24a1a31348b01d5b77e4c50bb7 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: BBlack <bbl...@wikimedia.org> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits