Dzahn has submitted this change and it was merged.

Change subject: labmon: Add basic graphite based monitoring for contint
......................................................................


labmon: Add basic graphite based monitoring for contint

Change-Id: Ia4b015369bfd2e978ca791293f864b0c533c1d9b
---
M files/icinga/contactgroups.cfg
M manifests/role/graphite.pp
A modules/contint/manifests/monitoring.pp
3 files changed, 84 insertions(+), 0 deletions(-)

Approvals:
  jenkins-bot: Verified
  Dzahn: Looks good to me, approved



diff --git a/files/icinga/contactgroups.cfg b/files/icinga/contactgroups.cfg
index 7a5c27f..dcb665f 100644
--- a/files/icinga/contactgroups.cfg
+++ b/files/icinga/contactgroups.cfg
@@ -55,6 +55,11 @@
 }
 
 define contactgroup {
+    contactgroup_name   contint
+    members             krinkle,amusso
+}
+
+define contactgroup {
     contactgroup_name   wikidata
     members             wikidata-monitoring,jzerebecki,irc-wikidata
 }
diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 72d7a8a..a032c86 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -247,4 +247,5 @@
 
     include role::beta::monitoring::graphite
     include toollabs::monitoring::graphite
+    include contint::monitoring::graphite
 }
diff --git a/modules/contint/manifests/monitoring.pp b/modules/contint/manifests/monitoring.pp
new file mode 100644
index 0000000..2a45e9f
--- /dev/null
+++ b/modules/contint/manifests/monitoring.pp
@@ -0,0 +1,78 @@
+# = Class: contint::monitoring::graphite
+# Sets up Graphite-based Icinga checks for all of integration
+class contint::monitoring::graphite {
+    monitor_graphite_threshold { 'contint-puppet-fail':
+        description     => 'CI: Puppet failure events',
+        metric          => 'integration.*.puppetagent.failed_events.value',
+        critical        => 1,
+        warning         => 1,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-puppet-stale':
+        description     => 'CI: Puppet freshness check',
+        metric          => 'integration.*.puppetagent.time_since_last_run.value',
+        warning         => 3600, # 1h
+        critical        => 43200, # 12h
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-low-space-var':
+        description     => 'CI: Low disk space on /var',
+        metric          => 'integration.*.diskspace._var.byte_avail.value',
+        warning         => 67108864, # 64MiB (2^26); NOTE(review): was labelled 512MiB, confirm intended threshold
+        critical        => 33554432, # 32MiB (2^25); NOTE(review): was labelled 256MiB, confirm intended threshold
+        under           => true,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-low-space-root':
+        description     => 'CI: Low disk space on /',
+        metric          => 'integration.*.diskspace.root.byte_avail.value',
+        warning         => 67108864, # 64MiB (2^26); NOTE(review): was labelled 512MiB, confirm intended threshold
+        critical        => 33554432, # 32MiB (2^25); NOTE(review): was labelled 256MiB, confirm intended threshold
+        under           => true,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-iowait':
+        description     => 'CI: Excess CPU check: iowait',
+        metric          => 'integration.*.cpu.total.iowait.value',
+        warning         => 95,
+        critical        => 99,
+        percentage      => 100,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-user':
+        description     => 'CI: Excess CPU check: user',
+        metric          => 'integration.*.cpu.total.user.value',
+        warning         => 95,
+        critical        => 99,
+        percentage      => 100,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-system':
+        description     => 'CI: Excess CPU check: system',
+        metric          => 'integration.*.cpu.total.system.value',
+        warning         => 95,
+        critical        => 99,
+        percentage      => 100,
+        graphite_url    => 'http://labmon1001.eqiad.wmnet',
+        contact_group   => 'contint',
+        series          => true,
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/161015
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia4b015369bfd2e978ca791293f864b0c533c1d9b
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Yuvipanda <[email protected]>
Gerrit-Reviewer: Dzahn <[email protected]>
Gerrit-Reviewer: Hashar <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to