Ottomata has uploaded a new change for review.
https://gerrit.wikimedia.org/r/258210
Change subject: Update kafka submodule, use kafka::server::monitoring class
from it in role::analytics::kafka::*
......................................................................
Update kafka submodule, use kafka::server::monitoring class from it in
role::analytics::kafka::*
Bug: T120957
Change-Id: I746d71f1b54eb4b913904c57248675a29d9fb417
---
M manifests/role/analytics/kafka.pp
M modules/kafka
2 files changed, 2 insertions(+), 51 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/10/258210/1
diff --git a/manifests/role/analytics/kafka.pp b/manifests/role/analytics/kafka.pp
index f6b3d47..3a5789e 100644
--- a/manifests/role/analytics/kafka.pp
+++ b/manifests/role/analytics/kafka.pp
@@ -211,58 +211,9 @@
# Monitor kafka in production
if $::realm == 'production' {
- # Generate icinga alert if Kafka Server is not running.
- nrpe::monitor_service { 'kafka':
- description => 'Kafka Broker Server',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "kafka.Kafka /etc/kafka/server.properties"',
- require => Class['::kafka::server'],
- critical => true,
+ class { '::kafka::server::monitoring':
+ nagios_servicegroup => 'analytics_eqiad',
}
-
- # Generate icinga alert if this jmxtrans instance is not running.
- nrpe::monitor_service { 'jmxtrans':
- description => 'jmxtrans',
- nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java --ereg-argument-array "-jar.+jmxtrans-all.jar"',
- require => Class['::kafka::server::jmxtrans'],
- }
-
- # Set up icinga monitoring of Kafka broker per second.
- # If this drops too low, trigger an alert.
- $nagios_servicegroup = 'analytics_eqiad'
-
- # jmxtrans statsd writer emits Kafka Broker fqdns in keys
- # by substiting '.' with '_' and suffixing the Broker port.
- $graphite_broker_key = regsubst("${::fqdn}_${jmx_port}", '\.', '_',
'G')
-
- # Alert if any Kafka has under replicated partitions.
- # If it does, this means a broker replica is falling behind
- # and will be removed from the ISR.
- monitoring::graphite_threshold {
'kafka-broker-UnderReplicatedPartitions':
- description => 'Kafka Broker Under Replicated Partitions',
- metric =>
"kafka.${graphite_broker_key}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value",
- # UnderReplicated partitions for more than a minute
- # or two shouldn't happen.
- warning => '1',
- critical => '10',
- require => Class['::kafka::server::jmxtrans'],
- group => $nagios_servicegroup,
- }
-
- # Alert if any Kafka Broker replica lag is too high
- monitoring::graphite_threshold { 'kafka-broker-Replica-MaxLag':
- description => 'Kafka Broker Replica Max Lag',
- metric =>
"kafka.${graphite_broker_key}.kafka.server.ReplicaFetcherManager.MaxLag.Value",
- # As of 2014-02 replag could catch up at more than 1000 msgs / sec,
- # (probably more like 2 or 3 K / second). At that rate, 1M messages
- # behind should catch back up in at least 30 minutes.
- warning => '1000000',
- critical => '5000000',
- require => Class['::kafka::server::jmxtrans'],
- group => $nagios_servicegroup,
- }
-
- # monitor disk statistics
- include role::analytics::monitor_disks
#firewall Kafka Broker
ferm::service { 'kafka-server':
diff --git a/modules/kafka b/modules/kafka
index b75128d..a0d532a 160000
--- a/modules/kafka
+++ b/modules/kafka
-Subproject commit b75128da275503c9523e7ec718e2003c8106da63
+Subproject commit a0d532a9b8436e0fb0a5767252502d951b8c548f
--
To view, visit https://gerrit.wikimedia.org/r/258210
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I746d71f1b54eb4b913904c57248675a29d9fb417
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits