Ottomata has submitted this change and it was merged.

Change subject: Adding alerts for webrequest data loss in HDFS
......................................................................


Adding alerts for webrequest data loss in HDFS

Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968
---
M manifests/misc/analytics.pp
M manifests/misc/monitoring.pp
M templates/icinga/checkcommands.cfg.erb
3 files changed, 28 insertions(+), 1 deletion(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/manifests/misc/analytics.pp b/manifests/misc/analytics.pp
index 66655da..bea13d8 100644
--- a/manifests/misc/analytics.pp
+++ b/manifests/misc/analytics.pp
@@ -35,4 +35,4 @@
                check_command         => "check_kafka_broker_produce_requests!3!2",
                contact_group         => "analytics",
        }
-}
\ No newline at end of file
+}
diff --git a/manifests/misc/monitoring.pp b/manifests/misc/monitoring.pp
index 5f6c6c1..a2ec64c 100644
--- a/manifests/misc/monitoring.pp
+++ b/manifests/misc/monitoring.pp
@@ -58,6 +58,20 @@
                        source => "puppet:///files/ganglia/plugins/kraken_webrequest_loss.pyconf",
                        notify => Service[gmond];
        }
+
+       # Set up icinga monitoring of Kraken HDFS data loss.
+       monitor_service { "kraken_webrequest_loss_average_positive":
+               description           => "webrequest_loss_average_positive",
+               check_command         => "check_kraken_webrequest_loss_positive!2!8",
+               contact_group         => "analytics",
+       }
+       # It is possible to have negative data loss.  This would mean that
+       # we are receiving duplicate log lines.  We need alerts for this too.
+       monitor_service { "kraken_webrequest_loss_average_negative":
+               description           => "webrequest_loss_average_negative",
+               check_command         => "check_kraken_webrequest_loss_negative!-2!-8",
+               contact_group         => "analytics",
+       }
 }
 
 # Ganglia views that should be
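For reference: in the monitor_service definitions above, the !-separated values in check_command become $ARG1$ and $ARG2$ (the warning and critical thresholds) of the Icinga command definitions added below. As a sketch, check_kraken_webrequest_loss_positive!2!8 would expand to roughly the following when Icinga runs it (the plugin path and hostname here are illustrative; $USER1$ commonly points at the Nagios plugin directory):

    /usr/lib/nagios/plugins/check_ganglios_generic_value -H analytics-host.example -m webrequest_loss_average -w 2 -c 8 -o gt

With -o gt, the check warns once the webrequest_loss_average Ganglia metric exceeds 2% and goes critical above 8%.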
diff --git a/templates/icinga/checkcommands.cfg.erb b/templates/icinga/checkcommands.cfg.erb
index 830ba4a..b156f7e 100644
--- a/templates/icinga/checkcommands.cfg.erb
+++ b/templates/icinga/checkcommands.cfg.erb
@@ -621,4 +621,17 @@
        command_line    $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m kafka_network_SocketServerStats.ProduceRequestsPerSecond -w $ARG1$ -c $ARG2$ -o lt
 }
 
+# Alerts for data loss in Kraken HDFS.
+define command{
+       command_name    check_kraken_webrequest_loss_positive
+       command_line    $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o gt
+}
+
+# Data loss percentage CAN be negative if we receive duplicate traffic
+# (this has happened before).  We need an extra alert if the percentage goes negative.
+define command{
+       command_name    check_kraken_webrequest_loss_negative
+       command_line    $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o lt
+}
+
 

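The negative check reuses the same webrequest_loss_average metric but flips the comparison operator to -o lt, so its thresholds are negative as well. Under the same assumptions as the sketch above, check_kraken_webrequest_loss_negative!-2!-8 would expand to roughly:

    /usr/lib/nagios/plugins/check_ganglios_generic_value -H analytics-host.example -m webrequest_loss_average -w -2 -c -8 -o lt

i.e. warn when the loss average drops below -2% (duplicate log lines inflating the counts) and go critical below -8%.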
-- 
To view, visit https://gerrit.wikimedia.org/r/66241
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
