Elukey has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/339640 )

Change subject: Tune JVM Heap size alarms for Hadoop daemons
......................................................................


Tune JVM Heap size alarms for Hadoop daemons

These alarms are new and still need to be tuned properly.
For the moment, limit the scope of the alerts to the
analytics contact group to avoid spamming ops.

Change-Id: I16d7f422698d771accbda3d7390d437f4d18e1e3
---
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
3 files changed, 18 insertions(+), 18 deletions(-)

Approvals:
  Elukey: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp 
b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 2a6ed45..c078465 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -97,8 +97,8 @@
         # Only include heap size alerts if heap size is configured.
         $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize', 
undef)
         if $hadoop_namenode_heapsize {
-            $nn_jvm_warning_threshold  = $hadoop_namenode_heapsize * 0.8
-            $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.9
+            $nn_jvm_warning_threshold  = $hadoop_namenode_heapsize * 0.9
+            $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
             monitoring::graphite_threshold { 
'hadoop-hdfs-namenode-heap-usaage':
                 description   => 'HDFS active Namenode JVM Heap usage',
                 metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
@@ -106,14 +106,14 @@
                 warning       => $nn_jvm_warning_threshold,
                 critical      => $nn_jvm_critical_threshold,
                 percentage    => '60',
-                contact_group => 'admins,analytics',
+                contact_group => 'analytics',
             }
         }
 
         $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
         if $hadoop_resourcemanager_heapsize {
-            $rm_jvm_warning_threshold  = $hadoop_resourcemanager_heapsize * 0.8
-            $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.9
+            $rm_jvm_warning_threshold  = $hadoop_resourcemanager_heapsize * 0.9
+            $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 
0.95
             monitoring::graphite_threshold { 
'hadoop-yarn-resourcemananager-heap-usage':
                 description   => 'YARN active ResourceManager JVM Heap usage',
                 metric        => 
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
@@ -121,7 +121,7 @@
                 warning       => $rm_jvm_warning_threshold,
                 critical      => $rm_jvm_critical_threshold,
                 percentage    => '60',
-                contact_group => 'admins,analytics',
+                contact_group => 'analytics',
             }
         }
     }
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp 
b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 2e9c7c8..3e03d14 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -38,8 +38,8 @@
         # The goal is to get alarms for long running memory leaks like T153951.
         # Only include heap size alerts if heap size is configured.
         if $hadoop_namenode_heapsize {
-            $nn_jvm_warning_threshold  = $hadoop_namenode_heapsize * 0.8
-            $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.9
+            $nn_jvm_warning_threshold  = $hadoop_namenode_heapsize * 0.9
+            $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95
             monitoring::graphite_threshold { 
'hadoop-hdfs-namenode-heap-usaage':
                 description   => 'HDFS standby Namenode JVM Heap usage',
                 metric        => 
"Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
@@ -47,7 +47,7 @@
                 warning       => $nn_jvm_warning_threshold,
                 critical      => $nn_jvm_critical_threshold,
                 percentage    => '60',
-                contact_group => 'admins,analytics',
+                contact_group => 'analytics',
             }
         }
     }
@@ -75,8 +75,8 @@
             # Only include heap size alerts if heap size is configured.
             $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize
             if $hadoop_resourcemanager_heapsize {
-                $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 
0.8
-                $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize 
* 0.9
+                $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 
0.9
+                $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize  
* 0.95
                 monitoring::graphite_threshold { 
'hadoop-yarn-resourcemananager-heap-usage':
                     description   => 'YARN standby Resource Manager JVM Heap 
usage',
                     metric        => 
"Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
@@ -84,7 +84,7 @@
                     warning       => $rm_jvm_warning_threshold,
                     critical      => $rm_jvm_critical_threshold,
                     percentage    => '60',
-                    contact_group => 'admins,analytics',
+                    contact_group => 'analytics',
                 }
             }
         }
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp 
b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index d4d856e..4040669 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -62,8 +62,8 @@
         # Only include heap size alerts if heap size is configured.
         $hadoop_datanode_heapsize = $::cdh::hadoop::hadoop_heapsize
         if $hadoop_datanode_heapsize {
-            $dn_jvm_warning_threshold  = $hadoop_datanode_heapsize * 0.8
-            $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.9
+            $dn_jvm_warning_threshold  = $hadoop_datanode_heapsize * 0.9
+            $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.95
             monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
                 description   => 'HDFS DataNode JVM Heap usage',
                 metric        => 
"Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
@@ -71,14 +71,14 @@
                 warning       => $dn_jvm_critical_threshold,
                 critical      => $dn_jvm_critical_threshold,
                 percentage    => '60',
-                contact_group => 'admins,analytics',
+                contact_group => 'analytics',
             }
         }
 
         $hadoop_nodemanager_heapsize = $::cdh::hadoop::yarn_heapsize
         if $hadoop_nodemanager_heapsize {
-            $nm_jvm_warning_threshold  = $hadoop_nodemanager_heapsize * 0.8
-            $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.9
+            $nm_jvm_warning_threshold  = $hadoop_nodemanager_heapsize * 0.9
+            $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.95
             monitoring::graphite_threshold { 
'analytics_hadoop_yarn_nodemanager':
                 description   => 'YARN NodeManager JVM Heap usage',
                 metric        => 
"Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
@@ -86,7 +86,7 @@
                 warning       => $nm_jvm_critical_threshold,
                 critical      => $nm_jvm_critical_threshold,
                 percentage    => '60',
-                contact_group => 'admins,analytics',
+                contact_group => 'analytics',
             }
         }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/339640
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I16d7f422698d771accbda3d7390d437f4d18e1e3
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: Ottomata <ao...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to