Elukey has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/337575 )
Change subject: Fix and tune the new Analytics Hadoop alarms ...................................................................... Fix and tune the new Analytics Hadoop alarms Bug: T88640 Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423 --- M modules/role/manifests/analytics_cluster/hadoop/master.pp M modules/role/manifests/analytics_cluster/hadoop/standby.pp M modules/role/manifests/analytics_cluster/hadoop/worker.pp 3 files changed, 8 insertions(+), 8 deletions(-) Approvals: Elukey: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp index 3303f5e..8963ac1 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/master.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp @@ -94,7 +94,7 @@ # Java heap space used alerts # The goal is to get alarms for long running memory leaks like T153951 - $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7 + $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8 $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9 monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode': description => 'HDFS active Namenode JVM Heap usage', @@ -106,11 +106,11 @@ contact_group => 'admins,analytics', } - $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7 + $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8 $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9 monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager': description => 'Yarn active ResourceManager JVM Heap usage', - metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", + metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", from => '60min', warning => $rm_jvm_warning_threshold, critical => $rm_jvm_critical_threshold, diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp index 7367790..2042cdf 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp @@ -35,7 +35,7 @@ # Java heap space used alerts # The goal is to get alarms for long running memory leaks like T153951 - $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7 + $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.8 $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9 monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs': description => 'HDFS standby Namenode JVM Heap usage', @@ -66,11 +66,11 @@ # Java heap space used alerts # The goal is to get alarms for long running memory leaks like T153951 - $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7 + $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8 $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9 monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager': description => 'YARN Resource Manager JVM Heap usage', - metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9984.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", + metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", from => '60min', warning => $rm_jvm_warning_threshold, critical => $rm_jvm_critical_threshold, diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp index 7aff0ce..f169109 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp @@ -59,9 +59,9 @@ # Java heap space used alerts # The goal is to get alarms for long running memory leaks like T153951 - $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.7 + $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.8 $dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9 - $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7 + $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.8 $nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9 monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager': description => 'YARN NodeManager JVM Heap usage', -- To view, visit https://gerrit.wikimedia.org/r/337575 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1e47c128ca04dc48690ecbd5d70fa7ee154b7423 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Elukey <ltosc...@wikimedia.org> Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits