Elukey has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/363154 )

Change subject: role::analytics_cluster::hadoop::master: add more monitors to 
HDFS metrics
......................................................................

role::analytics_cluster::hadoop::master: add more monitors to HDFS metrics

Bug: T163908
Change-Id: I22910c9e216b3af5211a7c2d7053e75bd4fc745f
---
M modules/role/manifests/analytics_cluster/hadoop/master.pp
1 file changed, 33 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/54/363154/1

diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp 
b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index c078465..00c5990 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -92,6 +92,39 @@
             ],
         }
 
+        # Alert when HDFS space usage rises above a safe threshold.
+        monitoring::graphite_threshold { 'hadoop-hdfs-percent-used':
+            description   => 'HDFS capacity used percentage',
+            contact_group => 'analytics',
+            metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean",
+            from          => '30min',
+            percentage    => '60',
+            warning       => 70,
+            critical      => 80,
+        }
+
+        # Alert on corrupted or missing HDFS blocks. In the ideal state both
+        # values are 0, so the thresholds are small absolute block counts.
+        monitoring::graphite_threshold { 'hadoop-hdfs-corrupt-blocks':
+            description   => 'HDFS corrupted blocks',
+            metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean",
+            from          => '30min',
+            warning       => 2,
+            critical      => 5,
+            percentage    => '60',
+            contact_group => 'analytics',
+        }
+
+        monitoring::graphite_threshold { 'hadoop-hdfs-missing-blocks':
+            description   => 'HDFS missing blocks',
+            metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean",
+            from          => '180min',
+            warning       => 2,
+            critical      => 5,
+            percentage    => '60',
+            contact_group => 'analytics',
+        }
+
         # Java heap space used alerts.
         # The goal is to get alarms for long running memory leaks like T153951.
         # Only include heap size alerts if heap size is configured.

-- 
To view, visit https://gerrit.wikimedia.org/r/363154
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I22910c9e216b3af5211a7c2d7053e75bd4fc745f
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <ltosc...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to