Elukey has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/337575 )

Change subject: Fix and tune the new Analytics Hadoop alarms
......................................................................


Fix and tune the new Analytics Hadoop alarms

Bug: T88640
Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423
---
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
3 files changed, 8 insertions(+), 8 deletions(-)

Approvals:
  Elukey: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 3303f5e..8963ac1 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -94,7 +94,7 @@
 
         # Java heap space used alerts
         # The goal is to get alarms for long running memory leaks like T153951
-        $namenode_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+        $namenode_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8
         $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
         monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode':
             description   => 'HDFS active Namenode JVM Heap usage',
@@ -106,11 +106,11 @@
             contact_group => 'admins,analytics',
         }
 
-        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.8
         $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
         monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager':
             description   => 'Yarn active ResourceManager JVM Heap usage',
-            metric        => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+            metric        => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
             from          => '60min',
             warning       => $rm_jvm_warning_threshold,
             critical      => $rm_jvm_critical_threshold,
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 7367790..2042cdf 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -35,7 +35,7 @@
 
         # Java heap space used alerts
         # The goal is to get alarms for long running memory leaks like T153951
-        $namenode_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
+        $namenode_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8
         $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
         monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs':
             description   => 'HDFS standby Namenode JVM Heap usage',
@@ -66,11 +66,11 @@
 
         # Java heap space used alerts
         # The goal is to get alarms for long running memory leaks like T153951
-        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $rm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.8
         $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
         monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager':
             description   => 'YARN Resource Manager JVM Heap usage',
-            metric        => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9984.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
+            metric        => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
             from          => '60min',
             warning       => $rm_jvm_warning_threshold,
             critical      => $rm_jvm_critical_threshold,
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index 7aff0ce..f169109 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -59,9 +59,9 @@
 
         # Java heap space used alerts
         # The goal is to get alarms for long running memory leaks like T153951
-        $dn_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_heapsize) * 0.7
+        $dn_jvm_warning_threshold  = hiera(cdh::hadoop::hadoop_heapsize) * 0.8
         $dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9
-        $nm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.7
+        $nm_jvm_warning_threshold  = hiera(cdh::hadoop::yarn_heapsize) * 0.8
         $nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
         monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager':
             description   => 'YARN NodeManager JVM Heap usage',

-- 
To view, visit https://gerrit.wikimedia.org/r/337575
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to