Filippo Giunchedi has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/197352

Change subject: graphite: add error alerts
......................................................................

graphite: add error alerts

provide alerts for error situations, e.g. queue dropping datapoints, too many
metric creations, file write errors

Bug: T92965
Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
---
M manifests/role/graphite.pp
A modules/graphite/manifests/monitoring/graphite.pp
2 files changed, 38 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/52/197352/1

diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 4fce5df..315539f 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -228,6 +228,7 @@
     include ::eventlogging::monitoring::graphite
     include ::swift::monitoring::graphite
     include ::swift_new::monitoring::graphite
+    include ::graphite::monitoring::graphite
 
     # Monitor production 5xx rates
     monitoring::graphite_threshold { 'reqstats_5xx':
diff --git a/modules/graphite/manifests/monitoring/graphite.pp b/modules/graphite/manifests/monitoring/graphite.pp
new file mode 100644
index 0000000..d4ef199
--- /dev/null
+++ b/modules/graphite/manifests/monitoring/graphite.pp
@@ -0,0 +1,37 @@
+class graphite::monitoring::graphite {
+    monitoring::graphite_threshold { 'carbon-relay queue full':
+        description     => 'carbon-relay queue full',
+        metric          => 'sumSeries(carbon.relays.graphite1001-*.destinations.*.fullQueueDrops)',
+        from            => '10minutes',
+        warning         => 200,
+        critical        => 1000,
+        nagios_critical => false
+    }
+
+    monitoring::graphite_threshold { 'carbon-cache write error':
+        description     => 'carbon-cache write error',
+        metric          => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.errors))',
+        from            => '10minutes',
+        warning         => 1,
+        critical        => 8,
+        nagios_critical => false
+    }
+
+    monitoring::graphite_threshold { 'carbon-cache overflows':
+        description     => 'carbon-cache queues overflow',
+        metric          => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.cache.overflow))',
+        from            => '10minutes',
+        warning         => 1,
+        critical        => 8,
+        nagios_critical => false
+    }
+
+    monitoring::graphite_threshold { 'carbon-cache creates':
+        description     => 'carbon-cache too many creates',
+        metric          => 'sumSeries(carbon.agents.graphite1001-*.creates)',
+        from            => '1hour',
+        warning         => 200,
+        critical        => 1000,
+        nagios_critical => false
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/197352
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to