Elukey has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363307 )

Change subject: role::analytics_cluster::hadoop::master: add icinga check for HDFS topology
......................................................................

role::analytics_cluster::hadoop::master: add icinga check for HDFS topology

Bug: T163909
Change-Id: I6f1c259eb872abcca60d96add2bfa89a0546a14d
---
A modules/role/files/analytics_cluster/hadoop/check_hdfs_topology
M modules/role/manifests/analytics_cluster/hadoop/master.pp
2 files changed, 31 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/07/363307/1

diff --git a/modules/role/files/analytics_cluster/hadoop/check_hdfs_topology b/modules/role/files/analytics_cluster/hadoop/check_hdfs_topology
new file mode 100644
index 0000000..919d806
--- /dev/null
+++ b/modules/role/files/analytics_cluster/hadoop/check_hdfs_topology
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# The HDFS topology should not contain any host in the 'default' rack.
+sudo -u hdfs dfsadmin -printTopology | grep -q 'Rack: default'
+
+if [ $? -eq 1 ]; then
+    echo "OK"
+else
+    echo "CRITICAL: There is at least one node in the default rack."
+fi
\ No newline at end of file
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 2ec2da4..4e8cce6 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -46,6 +46,14 @@
         require => Class['cdh::hadoop::master'],
     }
 
+    file { '/usr/local/lib/nagios/plugins/check_hdfs_topology':
+        ensure => present,
+        source => 'puppet:///modules/role/analytics_cluster/hadoop/check_hdfs_topology',
+        mode   => '0555',
+        owner  => 'root',
+        group  => 'root',
+    }
+
     # Include icinga alerts if production realm.
     if $::realm == 'production' {
         # Icinga process alerts for NameNode, ResourceManager and HistoryServer
@@ -76,10 +84,21 @@
             require       => Class['cdh::hadoop::master'],
         }
 
-        # Allow nagios to run the check_hdfs_active_namenode as hdfs user.
+        # Allow nagios to run some scripts as hdfs user.
         sudo::user { 'nagios-check_hdfs_active_namenode':
             user       => 'nagios',
-            privileges => ['ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode'],
+            privileges => [
+                'ALL = NOPASSWD: /usr/local/bin/check_hdfs_active_namenode',
+                'ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_hdfs_topology',
+            ],
+        }
+        # Alert if the HDFS topology shows any inconsistency.
+        nrpe::monitor_service { 'check_hdfs_topology':
+            description    => 'HDFS topology check',
+            nrpe_command   => '/usr/bin/sudo /usr/local/lib/nagios/plugins/check_hdfs_topology',
+            check_interval => 30,
+            retries        => 2,
+            require        => File['/usr/local/lib/nagios/plugins/check_hdfs_topology'],
         }
         # Alert if there is no active NameNode
         nrpe::monitor_service { 'hadoop-hdfs-active-namenode':

-- 
To view, visit https://gerrit.wikimedia.org/r/363307
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I6f1c259eb872abcca60d96add2bfa89a0546a14d
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <ltosc...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to