Yuvipanda has uploaded a new change for review.
https://gerrit.wikimedia.org/r/161015
Change subject: labmon: Add basic graphite based monitoring for contint
......................................................................
labmon: Add basic graphite based monitoring for contint
Change-Id: Ia4b015369bfd2e978ca791293f864b0c533c1d9b
---
M files/icinga/contactgroups.cfg
M manifests/role/graphite.pp
A modules/contint/manifests/monitoring.pp
3 files changed, 84 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/15/161015/1
diff --git a/files/icinga/contactgroups.cfg b/files/icinga/contactgroups.cfg
index 7a5c27f..dcb665f 100644
--- a/files/icinga/contactgroups.cfg
+++ b/files/icinga/contactgroups.cfg
@@ -55,6 +55,11 @@
}
define contactgroup {
+ contactgroup_name contint
+ members krinkle,amusso
+}
+
+define contactgroup {
contactgroup_name wikidata
members wikidata-monitoring,jzerebecki,irc-wikidata
}
diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 72d7a8a..a032c86 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -247,4 +247,5 @@
include role::beta::monitoring::graphite
include toollabs::monitoring::graphite
+ include contint::monitoring::graphite
}
diff --git a/modules/contint/manifests/monitoring.pp b/modules/contint/manifests/monitoring.pp
new file mode 100644
index 0000000..2a45e9f
--- /dev/null
+++ b/modules/contint/manifests/monitoring.pp
@@ -0,0 +1,78 @@
+# = Class: contint::monitoring::graphite
+# Sets up graphite based icinga checks for all of integration (metrics under integration.*)
+class contint::monitoring::graphite {
+    monitor_graphite_threshold { 'contint-puppet-fail':
+        description   => 'CI: Puppet failure events',
+        metric        => 'integration.*.puppetagent.failed_events.value',
+        critical      => 1,
+        warning       => 1, # same threshold as critical
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-puppet-stale':
+        description   => 'CI: Puppet freshness check',
+        metric        => 'integration.*.puppetagent.time_since_last_run.value',
+        warning       => 3600, # 1h, in seconds
+        critical      => 43200, # 12h, in seconds
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-low-space-var':
+        description   => 'CI: Low disk space on /var',
+        metric        => 'integration.*.diskspace._var.byte_avail.value',
+        warning       => 536870912, # 512MiB (512 * 1024 * 1024 bytes)
+        critical      => 268435456, # 256MiB (256 * 1024 * 1024 bytes)
+        under         => true, # presumably alerts when the value is below the thresholds; confirm in the define
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-low-space-root':
+        description   => 'CI: Low disk space on /',
+        metric        => 'integration.*.diskspace.root.byte_avail.value',
+        warning       => 536870912, # 512MiB (512 * 1024 * 1024 bytes)
+        critical      => 268435456, # 256MiB (256 * 1024 * 1024 bytes)
+        under         => true, # presumably alerts when the value is below the thresholds; confirm in the define
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-iowait':
+        description   => 'CI: Excess CPU check: iowait',
+        metric        => 'integration.*.cpu.total.iowait.value',
+        warning       => 95,
+        critical      => 99,
+        percentage    => 100,
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-user':
+        description   => 'CI: Excess CPU check: user',
+        metric        => 'integration.*.cpu.total.user.value',
+        warning       => 95,
+        critical      => 99,
+        percentage    => 100,
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+
+    monitor_graphite_threshold { 'contint-cpu-system':
+        description   => 'CI: Excess CPU check: system',
+        metric        => 'integration.*.cpu.total.system.value',
+        warning       => 95,
+        critical      => 99,
+        percentage    => 100,
+        graphite_url  => 'http://labmon1001.eqiad.wmnet',
+        contact_group => 'contint',
+        series        => true,
+    }
+}
--
To view, visit https://gerrit.wikimedia.org/r/161015
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia4b015369bfd2e978ca791293f864b0c533c1d9b
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Yuvipanda <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits