Ottomata has submitted this change and it was merged. Change subject: Adding alerts for webrequest data loss in HDFS ......................................................................
Adding alerts for webrequest data loss in HDFS Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968 --- M manifests/misc/analytics.pp M manifests/misc/monitoring.pp M templates/icinga/checkcommands.cfg.erb 3 files changed, 28 insertions(+), 1 deletion(-) Approvals: Ottomata: Verified; Looks good to me, approved diff --git a/manifests/misc/analytics.pp b/manifests/misc/analytics.pp index 66655da..bea13d8 100644 --- a/manifests/misc/analytics.pp +++ b/manifests/misc/analytics.pp @@ -35,4 +35,4 @@ check_command => "check_kafka_broker_produce_requests!3!2", contact_group => "analytics", } -} \ No newline at end of file +} diff --git a/manifests/misc/monitoring.pp b/manifests/misc/monitoring.pp index 5f6c6c1..a2ec64c 100644 --- a/manifests/misc/monitoring.pp +++ b/manifests/misc/monitoring.pp @@ -58,6 +58,20 @@ source => "puppet:///files/ganglia/plugins/kraken_webrequest_loss.pyconf", notify => Service[gmond]; } + + # Set up icinga monitoring of Kraken HDFS data loss. + monitor_service { "kraken_webrequest_loss_average_positive": + description => "webrequest_loss_average_positive", + check_command => "check_kraken_webrequest_loss_positive!2!8", + contact_group => "analytics", + } + # It is possible to have negative data loss. This would mean that + # we are receiving duplicate log lines. We need alerts for this too. 
+ monitor_service { "kraken_webrequest_loss_average_negative": + description => "webrequest_loss_average_negative", + check_command => "check_kraken_webrequest_loss_negative!-2!-8", + contact_group => "analytics", + } } # Ganglia views that should be diff --git a/templates/icinga/checkcommands.cfg.erb b/templates/icinga/checkcommands.cfg.erb index 830ba4a..b156f7e 100644 --- a/templates/icinga/checkcommands.cfg.erb +++ b/templates/icinga/checkcommands.cfg.erb @@ -621,4 +621,17 @@ command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m kafka_network_SocketServerStats.ProduceRequestsPerSecond -w $ARG1$ -c $ARG2$ -o lt } +# Alerts for data loss in Kraken HDFS. +define command{ + command_name check_kraken_webrequest_loss_positive + command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o gt +} + +# Data loss percentage CAN be negative if we receive duplicate traffic +# (this has happened before). We need an extra alert if the percentage goes negative. +define command{ + command_name check_kraken_webrequest_loss_negative + command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o lt +} + -- To view, visit https://gerrit.wikimedia.org/r/66241 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits