Elukey has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363154 )
Change subject: role::analytics_cluster::hadoop::master: add more monitors to HDFS metrics ...................................................................... role::analytics_cluster::hadoop::master: add more monitors to HDFS metrics Bug: T163908 Change-Id: I22910c9e216b3af5211a7c2d7053e75bd4fc745f --- M modules/role/manifests/analytics_cluster/hadoop/master.pp 1 file changed, 33 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/54/363154/1 diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp index c078465..00c5990 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/master.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp @@ -92,6 +92,39 @@ ], } + # Alert if the HDFS space consumption rises above a safe threshold. + monitoring::graphite_threshold { 'hadoop-hdfs-percent-used': + description => 'HDFS capacity used percentage', + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean", + from => '30min', + warning => 70, + critical => 80, + percentage => '60', + contact_group => 'analytics', + } + + # Alert in case of HDFS corrupted or missing blocks. In the ideal state + # these values should always be 0. 
+ monitoring::graphite_threshold { 'hadoop-hdfs-corrupt-blocks': + description => 'HDFS corrupted blocks', + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean", + from => '30min', + warning => 2, + critical => 5, + percentage => '60', + contact_group => 'analytics', + } + + monitoring::graphite_threshold { 'hadoop-hdfs-missing-blocks': + description => 'HDFS missing blocks', + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean", + from => '180min', + warning => 2, + critical => 5, + percentage => '60', + contact_group => 'analytics', + } + # Java heap space used alerts. # The goal is to get alarms for long running memory leaks like T153951. # Only include heap size alerts if heap size is configured. -- To view, visit https://gerrit.wikimedia.org/r/363154 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I22910c9e216b3af5211a7c2d7053e75bd4fc745f Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Elukey <ltosc...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits