Filippo Giunchedi has submitted this change and it was merged.

Change subject: configure additional Cassandra metric alerts
......................................................................


configure additional Cassandra metric alerts

Bug: T101764
Change-Id: I400e7e6a93fcb91872ff396cba7fc3471645e6bf
---
M manifests/role/restbase.pp
1 file changed, 82 insertions(+), 2 deletions(-)

Approvals:
  Filippo Giunchedi: Verified; Looks good to me, approved
  GWicke: Looks good to me, but someone else must approve



diff --git a/manifests/role/restbase.pp b/manifests/role/restbase.pp
index 588f51d..ac3c598 100644
--- a/manifests/role/restbase.pp
+++ b/manifests/role/restbase.pp
@@ -22,7 +22,7 @@
 
 class role::restbase::alerts {
     monitoring::graphite_threshold { 'restbase_request_5xx_rate':
-        description    => 'RESTBase req/s returning 5xx',
+        description    => 'RESTBase req/s returning 5xx (http://grafana.wikimedia.org/#/dashboard/db/restbase)',
         metric         => 'transformNull(restbase.v1_page_html_-title-_-revision--_tid-.GET.5xx.sample_rate, 0)',
         from           => '10min',
         warning        => '1', # 1 5xx/s
@@ -32,7 +32,7 @@
     }
 
     monitoring::graphite_threshold { 'restbase_html_storage_hit_latency':
-        description    => 'RESTBase HTML storage load mean latency ms',
+        description    => 'RESTBase HTML storage load mean latency ms (http://grafana.wikimedia.org/#/dashboard/db/restbase)',
         metric         => 'movingMedian(restbase.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.mean, 15)',
         from           => '10min',
         warning        => '25', # 25ms
@@ -40,4 +40,84 @@
         percentage     => '50',
         contact_group  => 'team-services',
     }
+
+    monitoring::graphite_threshold { 'restbase_html_storage_hit_latency_99p':
+        description    => 'RESTBase HTML storage load 99p latency ms (http://grafana.wikimedia.org/#/dashboard/db/restbase)',
+        metric         => 'movingMedian(restbase.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.p99, 15)',
+        from           => '10min',
+        warning        => '1500', # 1.5s
+        critical       => '3000', # 3s
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_storage_exceptions':
+        description    => 'RESTBase Cassandra highest storage exceptions (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-storage)',
+        metric         => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.Exceptions.count), 1)',
+        from           => '10min',
+        warning        => '5',
+        critical       => '10',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_total_hints':
+        description    => 'RESTBase Cassandra highest total hints (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-storage)',
+        metric         => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.TotalHints.count), 1)',
+        from           => '10min',
+        warning        => '600',
+        critical       => '1000',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_pending_compactions':
+        description    => 'RESTBase Cassandra highest pending compactions (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-compaction)',
+        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.Compaction.PendingTasks.value, 1)',
+        from           => '10min',
+        warning        => '100',
+        critical       => '400',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_sstables_per_read':
+        description    => 'RESTBase Cassandra highest SSTables per-read (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-cf-sstables-per-read)',
+        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.SSTablesPerReadHistogram.99percentile, 1)',
+        from           => '10min',
+        warning        => '6',
+        critical       => '10',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_tombstones_scanned':
+        description    => 'RESTBase Cassandra highest tombstones scanned (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-cf-tombstones-scanned)',
+        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.TombstoneScannedHistogram.99percentile, 1)',
+        from           => '10min',
+        warning        => '1000',
+        critical       => '1500',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_pending_internal':
+        description    => 'RESTBase Cassandra highest pending internal thread pool tasks (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-thread-pools)',
+        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ThreadPools.internal.*.PendingTasks.value, 1)',
+        from           => '10min',
+        warning        => '500',
+        critical       => '1000',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
+
+    monitoring::graphite_threshold { 'restbase_cassandra_highest_dropped_messages':
+        description    => 'RESTBase Cassandra highest dropped message rate (http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-dropped-messages)',
+        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.DroppedMessage.*.Dropped.1MinuteRate, 1)',
+        from           => '10min',
+        warning        => '50',
+        critical       => '100',
+        percentage     => '50',
+        contact_group  => 'team-services',
+    }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/218408
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I400e7e6a93fcb91872ff396cba7fc3471645e6bf
Gerrit-PatchSet: 8
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Eevans <eev...@wikimedia.org>
Gerrit-Reviewer: Eevans <eev...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: Mobrovac <mobro...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to