Ottomata has submitted this change and it was merged.

Change subject: Test addition of --until parameter to check_graphite check
......................................................................


Test addition of --until parameter to check_graphite check

Commit https://gerrit.wikimedia.org/r/#/c/254846/3 was reverted
when it unexpectedly broke most check_graphite alerts.  This change
re-applies it, but only to a single check on cp1054.  The
check_graphite_until_temp Python script has been manually
placed in /usr/lib/nagios/plugins on neon.  Once we verify that
this script and the check using $until work, we will remove these
conditionals and re-apply the original changes.

Bug: T116035
Change-Id: I9873428d6cd5db24a1a31348b01d5b77e4c50bb7
---
M modules/monitoring/manifests/graphite_threshold.pp
M modules/nagios_common/files/check_commands/check_graphite.cfg
M modules/role/manifests/cache/kafka/webrequest.pp
3 files changed, 61 insertions(+), 19 deletions(-)

Approvals:
  Ottomata: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/monitoring/manifests/graphite_threshold.pp b/modules/monitoring/manifests/graphite_threshold.pp
index 6ea2bfa..7ae1408 100644
--- a/modules/monitoring/manifests/graphite_threshold.pp
+++ b/modules/monitoring/manifests/graphite_threshold.pp
@@ -54,6 +54,8 @@
     $critical,
     $series                = false,
     $from                  = '10min',
+    # temporarily use $until to conditionally use check_graphite_until command
+    $until                 = undef,
     $percentage            = 1,
     $under                 = false,
     $graphite_url          = 'http://graphite.wikimedia.org',
@@ -80,33 +82,56 @@
     #   $ARG3$  the metric to monitor
     #   $ARG4$  -W warning threshold
     #   $ARG5$  -C critical threshold
-    #   $ARG6$  --from start sampling date
-    #   $ARG7$  --perc percentage of exceeding datapoints
-    #   $ARG8$  --over or --under
+    #   $ARG6$  --from start sampling date (negative relative time from now)
+    #####   $ARG7$  --until end sampling date (negative relative time from now)
+    #   $ARG8$  --perc percentage of exceeding datapoints
+    #   $ARG9$  --over or --under
     $modifier = $under ? {
         true  => '--under',
         default => '--over'
-    }
-    $command = $series ? {
-        true    => 'check_graphite_series_threshold',
-        default => 'check_graphite_threshold'
     }
 
     if $metric =~ /'/ {
         fail("single quotes will be stripped from graphite metric ${metric}, consider using double quotes")
     }
 
-    monitoring::service { $title:
-        ensure                => $ensure,
-        description           => $description,
-        check_command         => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}",
-        retries               => $retries,
-        group                 => $group,
-        critical              => $nagios_critical,
-        passive               => $passive,
-        freshness             => $freshness,
-        normal_check_interval => $normal_check_interval,
-        retry_check_interval  => $retry_check_interval,
-        contact_group         => $contact_group,
+    # TEMPORARY conditional to test the --until arg without affecting all
+    # alerts. This conditional will be removed once we are sure --until works.
+    if $until and !$series {
+        $command = 'check_graphite_threshold_until_temp'
+
+        monitoring::service { $title:
+            ensure                => $ensure,
+            description           => $description,
+            check_command         => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}",
+            retries               => $retries,
+            group                 => $group,
+            critical              => $nagios_critical,
+            passive               => $passive,
+            freshness             => $freshness,
+            normal_check_interval => $normal_check_interval,
+            retry_check_interval  => $retry_check_interval,
+            contact_group         => $contact_group,
+        }
+    }
+    else {
+        $command = $series ? {
+            true    => 'check_graphite_series_threshold',
+            default => 'check_graphite_threshold'
+        }
+
+        monitoring::service { $title:
+            ensure                => $ensure,
+            description           => $description,
+            check_command         => "${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}",
+            retries               => $retries,
+            group                 => $group,
+            critical              => $nagios_critical,
+            passive               => $passive,
+            freshness             => $freshness,
+            normal_check_interval => $normal_check_interval,
+            retry_check_interval  => $retry_check_interval,
+            contact_group         => $contact_group,
+        }
     }
 }
diff --git a/modules/nagios_common/files/check_commands/check_graphite.cfg b/modules/nagios_common/files/check_commands/check_graphite.cfg
index b8baf3a..68c60ac 100644
--- a/modules/nagios_common/files/check_commands/check_graphite.cfg
+++ b/modules/nagios_common/files/check_commands/check_graphite.cfg
@@ -14,3 +14,11 @@
     command_name    check_graphite_anomaly
     command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly '$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ $ARG7$
 }
+
+# NOTE: This is a temporary check command that will be removed once it is
+# verified to not break things and can be integrated into the regular
+# check_graphite_threshold.
+define command{
+    command_name    check_graphite_threshold_until_temp
+    command_line    $USER1$/check_graphite_until_temp -U $ARG1$ -T $ARG2$ check_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ --perc $ARG8$ $ARG9$
+}
diff --git a/modules/role/manifests/cache/kafka/webrequest.pp b/modules/role/manifests/cache/kafka/webrequest.pp
index 39bc41f..4c18059 100644
--- a/modules/role/manifests/cache/kafka/webrequest.pp
+++ b/modules/role/manifests/cache/kafka/webrequest.pp
@@ -75,6 +75,14 @@
         logster_options => "-o statsd --statsd-host=statsd.eqiad.wmnet:8125 --metric-prefix=${graphite_metric_prefix}",
     }
 
+
+    # TEMPORARY test --until only on a single host
+    $until = $::fqdn ? {
+        'cp1054.eqiad.wmnet' => '0min',
+        default              => undef,
+    }
+
+
     # Generate an alert if too many delivery report errors per minute
     # (logster only reports once a minute)
     monitoring::graphite_threshold { 'varnishkafka-kafka_drerr':
@@ -88,6 +96,7 @@
         # are over the threshold.
         percentage      => 80,
         from            => '10min',
+        until           => $until,
         nagios_critical => false,
         require         => Logster::Job['varnishkafka-webrequest'],
         ensure          => 'present',

-- 
To view, visit https://gerrit.wikimedia.org/r/255415
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9873428d6cd5db24a1a31348b01d5b77e4c50bb7
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: BBlack <bbl...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to