Filippo Giunchedi has submitted this change and it was merged. Change subject: icinga: update checks after statsite migration ......................................................................
icinga: update checks after statsite migration Change-Id: Ief9572bef4872fc8999ef5ec4a3a3e67ac500d84 --- M manifests/role/cache.pp M manifests/role/labsnfs.pp M manifests/role/nova.pp M manifests/swift.pp M modules/mediawiki/manifests/monitoring/webserver.pp M modules/swift_new/manifests/monitoring/graphite.pp 6 files changed, 12 insertions(+), 12 deletions(-) Approvals: Filippo Giunchedi: Verified; Looks good to me, approved diff --git a/manifests/role/cache.pp b/manifests/role/cache.pp index 6c2a413..2342f0a 100644 --- a/manifests/role/cache.pp +++ b/manifests/role/cache.pp @@ -532,7 +532,7 @@ # (logster only reports once a minute) monitoring::graphite_threshold { 'varnishkafka-kafka_drerr': description => 'Varnishkafka Delivery Errors per minute', - metric => "derivative(${graphite_metric_prefix}.varnishkafka.kafka_drerr.value)", + metric => "derivative(${graphite_metric_prefix}.varnishkafka.kafka_drerr)", # warn if more than 0 errors per minute in the last 10 minutes warning => 0, # critical if more than 20000 errors per minute in the last 10 minutes diff --git a/manifests/role/labsnfs.pp b/manifests/role/labsnfs.pp index 416396e..c3253c1 100644 --- a/manifests/role/labsnfs.pp +++ b/manifests/role/labsnfs.pp @@ -55,7 +55,7 @@ monitoring::graphite_threshold { 'network_out_saturated': description => 'Outgoing network saturation', - metric => "servers.${::hostname}.network.${monitor_iface}.tx_byte.value", + metric => "servers.${::hostname}.network.${monitor_iface}.tx_byte", from => '30min', warning => '75000000', # roughly 600Mbps / 1Gbps critical => '100000000', # roughly 800Mbps / 1Gbps @@ -64,7 +64,7 @@ monitoring::graphite_threshold { 'network_in_saturated': description => 'Incoming network saturation', - metric => "servers.${::hostname}.network.${monitor_iface}.rx_byte.value", + metric => "servers.${::hostname}.network.${monitor_iface}.rx_byte", from => '30min', warning => '75000000', # roughly 600Mbps / 1Gbps critical => '100000000', # roughly 800Mbps / 1Gbps @@ -73,7 +73,7 @@ monitoring::graphite_threshold { 'high_iowait_stalling': description => 'Persistent high iowait', - metric => "servers.${::hostname}.cpu.total.iowait.value", + metric => "servers.${::hostname}.cpu.total.iowait", from => '10min', warning => '25', # Based off looking at history of metric critical => '35', @@ -83,7 +83,7 @@ # Monitor for high load consistently, is a 'catchall' monitoring::graphite_threshold { 'high_load': description => 'High load for whatever reason', - metric => "servers.${::hostname}.cpu.total.iowait.value", + metric => "servers.${::hostname}.cpu.total.iowait", from => '10min', warning => '16', critical => '24', diff --git a/manifests/role/nova.pp b/manifests/role/nova.pp index 37ff0b4..8e0adf4 100644 --- a/manifests/role/nova.pp +++ b/manifests/role/nova.pp @@ -362,7 +362,7 @@ # but graphite_threshold doesn't support that. monitoring::graphite_threshold { 'conntrack_saturated': description => 'Connection tracking saturation', - metric => "servers.${::hostname}.ConntrackCollector.network.netfilter.conntrack_count.value", + metric => "servers.${::hostname}.ConntrackCollector.network.netfilter.conntrack_count", from => '5min', warning => '241664', # (~90%) critical => '258048', # (~98%) diff --git a/manifests/swift.pp b/manifests/swift.pp index ca83755..a951fa0 100644 --- a/manifests/swift.pp +++ b/manifests/swift.pp @@ -195,7 +195,7 @@ class swift::monitoring::graphite { monitoring::graphite_threshold { 'swift_eqiad-prod_dispersion_object': description => 'swift eqiad-prod object availability', - metric => 'swift.eqiad-prod.dispersion.object.pct_found.value', + metric => 'swift.eqiad-prod.dispersion.object.pct_found', from => '1hours', warning => 95, critical => 90, @@ -205,7 +205,7 @@ monitoring::graphite_threshold { 'swift_eqiad-prod_dispersion_container': description => 'swift eqiad-prod container availability', - metric => 'swift.eqiad-prod.dispersion.container.pct_found.value', + metric => 'swift.eqiad-prod.dispersion.container.pct_found', from => '30min', warning => 92, critical => 88, diff --git a/modules/mediawiki/manifests/monitoring/webserver.pp b/modules/mediawiki/manifests/monitoring/webserver.pp index 79dfeb5..21ddfbc 100644 --- a/modules/mediawiki/manifests/monitoring/webserver.pp +++ b/modules/mediawiki/manifests/monitoring/webserver.pp @@ -17,7 +17,7 @@ monitoring::graphite_threshold { 'hhvm_queue_size': description => 'HHVM queue size', - metric => "servers.${::hostname}.hhvmHealthCollector.queued.value", + metric => "servers.${::hostname}.hhvmHealthCollector.queued", warning => 10, critical => 80, percentage => 30, @@ -26,7 +26,7 @@ monitoring::graphite_threshold { 'hhvm_load': description => 'HHVM busy threads', - metric => "servers.${::hostname}.hhvmHealthCollector.load.value", + metric => "servers.${::hostname}.hhvmHealthCollector.load", warning => $::mediawiki::hhvm::max_threads*0.6, critical => $::mediawiki::hhvm::max_threads * 0.9, percentage => 30, diff --git a/modules/swift_new/manifests/monitoring/graphite.pp b/modules/swift_new/manifests/monitoring/graphite.pp index 2b7460c..b48111c 100644 --- a/modules/swift_new/manifests/monitoring/graphite.pp +++ b/modules/swift_new/manifests/monitoring/graphite.pp @@ -3,7 +3,7 @@ ) { monitoring::graphite_threshold { "swift_${swift_cluster}_dispersion_object": description => "swift ${swift_cluster} object availability", - metric => "swift.${swift_cluster}.dispersion.object.pct_found.value", + metric => "swift.${swift_cluster}.dispersion.object.pct_found", from => '1hours', warning => 95, critical => 90, @@ -13,7 +13,7 @@ monitoring::graphite_threshold { "swift_${swift_cluster_dispersion_container}": description => "swift ${swift_cluster} container availability", - metric => "swift.${swift_cluster}.dispersion.container.pct_found.value", + metric => "swift.${swift_cluster}.dispersion.container.pct_found", from => '30min', warning => 92, critical => 88, -- To view, visit https://gerrit.wikimedia.org/r/203039 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ief9572bef4872fc8999ef5ec4a3a3e67ac500d84 Gerrit-PatchSet: 2 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org> Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits