Elukey has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/339640 )
Change subject: Tune JVM Heap size alarms for Hadoop daemons ...................................................................... Tune JVM Heap size alarms for Hadoop daemons These alarms are new and need to be tuned properly. For the moment, limiting the scope of the alerts to analytics to avoid spamming ops. Change-Id: I16d7f422698d771accbda3d7390d437f4d18e1e3 --- M modules/role/manifests/analytics_cluster/hadoop/master.pp M modules/role/manifests/analytics_cluster/hadoop/standby.pp M modules/role/manifests/analytics_cluster/hadoop/worker.pp 3 files changed, 18 insertions(+), 18 deletions(-) Approvals: Elukey: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp index 2a6ed45..c078465 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/master.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp @@ -97,8 +97,8 @@ # Only include heap size alerts if heap size is configured. $hadoop_namenode_heapsize = hiera('cdh::hadoop::namenode_heapsize', undef) if $hadoop_namenode_heapsize { - $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.8 - $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.9 + $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9 + $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-hdfs-namenode-heap-usaage': description => 'HDFS active Namenode JVM Heap usage', metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", @@ -106,14 +106,14 @@ warning => $nn_jvm_warning_threshold, critical => $nn_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize if $hadoop_resourcemanager_heapsize { - $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.8 - $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.9 + $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9 + $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-yarn-resourcemananager-heap-usage': description => 'YARN active ResourceManager JVM Heap usage', metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", @@ -121,7 +121,7 @@ warning => $rm_jvm_warning_threshold, critical => $rm_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } } diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp index 2e9c7c8..3e03d14 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp @@ -38,8 +38,8 @@ # The goal is to get alarms for long running memory leaks like T153951. # Only include heap size alerts if heap size is configured. if $hadoop_namenode_heapsize { - $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.8 - $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.9 + $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9 + $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-hdfs-namenode-heap-usaage': description => 'HDFS standby Namenode JVM Heap usage', metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", @@ -47,7 +47,7 @@ warning => $nn_jvm_warning_threshold, critical => $nn_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } } @@ -75,8 +75,8 @@ # Only include heap size alerts if heap size is configured. $hadoop_resourcemanager_heapsize = $::cdh::hadoop::yarn_heapsize if $hadoop_resourcemanager_heapsize { - $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.8 - $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.9 + $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9 + $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-yarn-resourcemananager-heap-usage': description => 'YARN standby Resource Manager JVM Heap usage', metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", @@ -84,7 +84,7 @@ warning => $rm_jvm_warning_threshold, critical => $rm_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } } diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp index d4d856e..4040669 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp @@ -62,8 +62,8 @@ # Only include heap size alerts if heap size is configured. $hadoop_datanode_heapsize = $::cdh::hadoop::hadoop_heapsize if $hadoop_datanode_heapsize { - $dn_jvm_warning_threshold = $hadoop_datanode_heapsize * 0.8 - $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.9 + $dn_jvm_warning_threshold = $hadoop_datanode_heapsize * 0.9 + $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.95 monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode': description => 'HDFS DataNode JVM Heap usage', metric => "Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper", @@ -71,14 +71,14 @@ warning => $dn_jvm_critical_threshold, critical => $dn_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } $hadoop_nodemanager_heapsize = $::cdh::hadoop::yarn_heapsize if $hadoop_nodemanager_heapsize { - $nm_jvm_warning_threshold = $hadoop_nodemanager_heapsize * 0.8 - $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.9 + $nm_jvm_warning_threshold = $hadoop_nodemanager_heapsize * 0.9 + $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.95 monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager': description => 'YARN NodeManager JVM Heap usage', metric => "Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper", @@ -86,7 +86,7 @@ warning => $nm_jvm_critical_threshold, critical => $nm_jvm_critical_threshold, percentage => '60', - contact_group => 'admins,analytics', + contact_group => 'analytics', } } -- To view, visit https://gerrit.wikimedia.org/r/339640 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I16d7f422698d771accbda3d7390d437f4d18e1e3 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Elukey <ltosc...@wikimedia.org> Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org> Gerrit-Reviewer: Ottomata <ao...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits