Elukey has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/335648 )
Change subject: Revert "Add JVM Heap usage alarms for basic Hadoop daemons"
......................................................................
Revert "Add JVM Heap usage alarms for basic Hadoop daemons"
This reverts commit 4735b0f002a7ac5269186fd8399231cdcc5c2253.
Change-Id: I0843e88286cdcb3d7ff41fc844c93920ac1f2fb0
---
M hieradata/eqiad/cdh/hadoop.yaml
M modules/role/manifests/analytics_cluster/hadoop/master.pp
M modules/role/manifests/analytics_cluster/hadoop/standby.pp
M modules/role/manifests/analytics_cluster/hadoop/worker.pp
4 files changed, 1 insertion(+), 87 deletions(-)
Approvals:
Elukey: Looks good to me, approved
jenkins-bot: Verified
diff --git a/hieradata/eqiad/cdh/hadoop.yaml b/hieradata/eqiad/cdh/hadoop.yaml
index 04e1baf..39c9879 100644
--- a/hieradata/eqiad/cdh/hadoop.yaml
+++ b/hieradata/eqiad/cdh/hadoop.yaml
@@ -32,8 +32,6 @@
cdh::hadoop::net_topology_script_template:
'role/analytics_cluster/hadoop/net-topology.py.erb'
# Increase NameNode heapsize independent from other daemons
-# The opts value will be used for the JVM, meanwhile the raw number for alarms.
-cdh::hadoop::hadoop_namenode_heapsize: 4096
cdh::hadoop::hadoop_namenode_opts: "-Xmx4096m"
cdh::hadoop::mapreduce_reduce_shuffle_parallelcopies: 10
diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp
index 4e976a4..d30450c 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/master.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp
@@ -90,32 +90,6 @@
Sudo::User['nagios-check_hdfs_active_namenode'],
],
}
-
- # Java heap space used alerts
- # The goal is to get alarms for long running memory leaks like T153951
- $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
- $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
- monitoring::graphite_threshold { 'analytics_hadoop_hdfs_namenode':
- description => 'HDFS active Namenode JVM Heap usage',
- metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $namenode_jvm_warning_threshold,
- critical => $namenode_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
-
- $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
- $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
- monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager':
- description => 'Yarn active ResourceManager JVM Heap usage',
- metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9980.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $rm_jvm_warning_threshold,
- critical => $rm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
}
# Firewall
diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
index 3d30dfe..21c0455 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp
@@ -31,20 +31,6 @@
contact_group => 'admins,analytics',
require => Class['cdh::hadoop::namenode::standby'],
}
-
- # Java heap space used alerts
- # The goal is to get alarms for long running memory leaks like T153951
- $namenode_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.7
- $namenode_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_namenode_heapsize) * 0.9
- monitoring::graphite_threshold { 'analytics_hadoop_namenode_hdfs':
- description => 'HDFS standby Namenode JVM Heap usage',
- metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $namenode_jvm_warning_threshold,
- critical => $namenode_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
}
# Firewall
@@ -57,25 +43,6 @@
include ::cdh::hadoop::resourcemanager
# Firewall
include ::role::analytics_cluster::hadoop::ferm::resourcemanager
-
- # Use jmxtrans for sending metrics
- class { 'cdh::hadoop::jmxtrans::resourcemanager':
- statsd => hiera('statsd'),
- }
-
- # Java heap space used alerts
- # The goal is to get alarms for long running memory leaks like T153951
- $rm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
- $rm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
- monitoring::graphite_threshold { 'analytics_hadoop_yarn_resource_manager':
- description => 'YARN Resource Manager JVM Heap usage',
- metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9984.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $rm_jvm_warning_threshold,
- critical => $rm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
}
-}
+}
\ No newline at end of file
diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
index bda0a67..9030b91 100644
--- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp
+++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp
@@ -54,31 +54,6 @@
contact_group => 'admins,analytics',
retry_interval => 3,
}
-
- # Java heap space used alerts
- # The goal is to get alarms for long running memory leaks like T153951
- $dn_jvm_warning_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.7
- $dn_jvm_critical_threshold = hiera(cdh::hadoop::hadoop_heapsize) * 0.9
- $nm_jvm_warning_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.7
- $nm_jvm_critical_threshold = hiera(cdh::hadoop::yarn_heapsize) * 0.9
- monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager':
- description => 'YARN NodeManager JVM Heap usage',
- metric => "Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $dn_jvm_warning_threshold,
- critical => $dn_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
- monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode':
- description => 'HDFS DataNode JVM Heap usage',
- metric => "Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper",
- from => '60min',
- warning => $nm_jvm_warning_threshold,
- critical => $nm_jvm_critical_threshold,
- percentage => '60',
- contact_group => 'admins,analytics',
- }
}
# hive::client is nice to have for jobs launched
--
To view, visit https://gerrit.wikimedia.org/r/335648
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I0843e88286cdcb3d7ff41fc844c93920ac1f2fb0
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Elukey <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits