Nuria has uploaded a new change for review.
https://gerrit.wikimedia.org/r/189588
Change subject: Correcting docs and thresholds for eventlogging alarms
......................................................................
Correcting docs and thresholds for eventlogging alarms
Change-Id: I846bfe5fefa706cdb43424725bc6ac06d77a3ad7
---
M modules/eventlogging/manifests/monitoring.pp
1 file changed, 9 insertions(+), 3 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/88/189588/1
diff --git a/modules/eventlogging/manifests/monitoring.pp
b/modules/eventlogging/manifests/monitoring.pp
index 8768823..7918526 100644
--- a/modules/eventlogging/manifests/monitoring.pp
+++ b/modules/eventlogging/manifests/monitoring.pp
@@ -41,8 +41,10 @@
# of incoming events.
class eventlogging::monitoring::graphite {
- # Warn if 1% of overall event throughput goes beyond 500 events/s
+ # Warn if 15% of overall event throughput goes beyond 500 events/s
# in a 15 min period
+ # Note that these 'per second' rates are calculated per minute in txstatsd
+ # thus in 15 mins we are going to have 15 measures
# These thresholds are somewhat arbitrary at this point, but it
# was seen that the current setup can handle 500 events/s.
# Better thresholds are pending (see T86244).
@@ -51,12 +53,15 @@
metric => 'eventlogging.overall.raw.rate',
warning => 500,
critical => 600,
+ percentage => 15, # At least 3 of the 15 readings
from => '15min',
contact_group => 'analytics'
}
- # Alarms if 1% of Navigation Timing event throughput goes under 2 req/sec
+ # Alarms if 15% of Navigation Timing event throughput goes under 2 req/sec
# in a 15 min period
+ # Note that these 'per second' rates are calculated per minute in txstatsd
+ # thus in 15 mins we are going to have 15 measures
# https://meta.wikimedia.org/wiki/Schema:NavigationTiming
# Note:
# you can test this via doing:
@@ -64,13 +69,14 @@
# --url http://graphite.wikimedia.org check_threshold
# eventlogging.schema.NavigationTiming.rate --from 15min -C 1 -W 2 --under
# it will report the following:
- # OK: Less than 1.00% data above the threshold [2.0]
+ # OK: Less than 15% data above the threshold [2.0]
# but the check is actually correct: it is checking points below the threshold
monitoring::graphite_threshold {
'eventlogging_NavigationTiming_throughput':
description => 'Throughput of event logging NavigationTiming
events',
metric => 'eventlogging.schema.NavigationTiming.rate',
warning => 2,
critical => 1,
+ percentage => 15, # At least 3 of the 15 readings
from => '15min',
contact_group => 'analytics',
under => true
--
To view, visit https://gerrit.wikimedia.org/r/189588
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I846bfe5fefa706cdb43424725bc6ac06d77a3ad7
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Nuria <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits