Ottomata has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/381489 )

Change subject: [WIP] Prometheus based Kafka broker alerts, take 1
......................................................................

[WIP] Prometheus based Kafka broker alerts, take 1

This refactors the Prometheus JMX exporter just a bit, moving
it to a separate profile::kafka::broker::monitoring class,
along with icinga alerts.

Bug: T175923
Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
---
M hieradata/role/common/kafka/jumbo/broker.yaml
M modules/profile/manifests/kafka/broker.pp
A modules/profile/manifests/kafka/broker/monitoring.pp
3 files changed, 87 insertions(+), 39 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/89/381489/1

diff --git a/hieradata/role/common/kafka/jumbo/broker.yaml 
b/hieradata/role/common/kafka/jumbo/broker.yaml
index 5fb6770..5b18607 100644
--- a/hieradata/role/common/kafka/jumbo/broker.yaml
+++ b/hieradata/role/common/kafka/jumbo/broker.yaml
@@ -2,8 +2,8 @@
 
 profile::kafka::broker::kafka_cluster_name: jumbo
 
-# Enable the Prometheus JMX Exporter
-profile::kafka::broker::prometheus_monitoring_enabled: true
+# Enable Monitoring (via Prometheus) and icinga alerts
+profile::kafka::broker::monitoring_enabled: true
 
 profile::kafka::broker::log_dirs: [/srv/kafka/data]
 profile::kafka::broker::plaintext: true
@@ -28,5 +28,5 @@
 profile::kafka::broker::num_recovery_threads_per_data_dir: 12
 profile::kafka::broker::num_io_threads: 12
 
-profile::kafka::broker::replica_maxlag_warning: "1000000"
-profile::kafka::broker::replica_maxlag_critical: "5000000"
+profile::kafka::broker::monitoring::replica_maxlag_warning: 1000000
+profile::kafka::broker::monitoring::replica_maxlag_critical: 5000000
diff --git a/modules/profile/manifests/kafka/broker.pp 
b/modules/profile/manifests/kafka/broker.pp
index 0ce4f1a..e0b3e71 100644
--- a/modules/profile/manifests/kafka/broker.pp
+++ b/modules/profile/manifests/kafka/broker.pp
@@ -87,7 +87,7 @@
     $replica_maxlag_critical           = 
hiera('profile::kafka::broker::replica_maxlag_critical'),
     # This is set via top level hiera variable so it can be synchronized 
between roles and clients.
     $message_max_bytes                 = hiera('kafka_message_max_bytes'),
-    $prometheus_monitoring_enabled     = 
hiera('profile::kafka::broker::prometheus_monitoring_enabled'),
+    $monitoring_enabled               = 
hiera('profile::kafka::broker::monitoring_enabled'),
     $prometheus_nodes                  = hiera('prometheus_nodes'),
 ) {
     # TODO: WIP
@@ -185,39 +185,6 @@
         java_home     => '/usr/lib/jvm/java-8-openjdk-amd64',
     }
 
-    if $prometheus_monitoring_enabled {
-        # Allow automatic generation of config on the
-        # Prometheus master
-        prometheus::jmx_exporter_instance { $::hostname:
-            address => $::ipaddress,
-            port    => 7800,
-        }
-
-        $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
-        ferm::service { 'kafka-broker-jmx_exporter':
-            proto  => 'tcp',
-            port   => '7800',
-            srange => "@resolve((${prometheus_nodes_ferm}))",
-        }
-
-        require_package('prometheus-jmx-exporter')
-
-        $jmx_exporter_config_file = 
'/etc/kafka/broker_prometheus_jmx_exporter.yaml'
-        $java_opts = 
"-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:7800:${jmx_exporter_config_file}"
-
-        # Create the Prometheus JMX Exporter configuration
-        file { $jmx_exporter_config_file:
-            ensure  => present,
-            source  => 
'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
-            owner   => 'kafka',
-            group   => 'kafka',
-            mode    => '0400',
-            require => Class['::confluent::kafka::broker'],
-        }
-    } else {
-        $java_opts = undef
-    }
-
     class { '::confluent::kafka::broker':
         log_dirs                         => $log_dirs,
         brokers                          => $config['brokers']['hash'],
@@ -233,7 +200,6 @@
         # https://kafka.apache.org/documentation/#java
         # Note that MetaspaceSize is a Java 8 setting.
         jvm_performance_opts             => '-server -XX:MetaspaceSize=96m 
-XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 
-XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 
-XX:MaxMetaspaceFreeRatio=80',
-        java_opts                        => $java_opts,
         listeners                        => $listeners,
 
         security_inter_broker_protocol   => $security_inter_broker_protocol,
@@ -249,6 +215,15 @@
         message_max_bytes                => $message_max_bytes,
     }
 
+    # If monitoring is enabled, then include the monitoring profile and set 
$java_opts
+    # for exposing the Prometheus JMX Exporter in the Kafka Broker process.
+    if $monitoring_enabled {
+        include ::profile::kafka::broker::monitoring
+        Class['::confluent::kafka::broker'] {
+            java_opts => $::profile::kafka::broker::monitoring::java_opts
+        }
+    }
+
     $ferm_plaintext_ensure = $plaintext ? {
         false => 'absent',
         undef => 'absent',
diff --git a/modules/profile/manifests/kafka/broker/monitoring.pp 
b/modules/profile/manifests/kafka/broker/monitoring.pp
new file mode 100644
index 0000000..628972b
--- /dev/null
+++ b/modules/profile/manifests/kafka/broker/monitoring.pp
@@ -0,0 +1,73 @@
+# Class: profile::kafka::broker::monitoring
+#
+# Sets up Prometheus based monitoring (JMX exporter config and firewall rule) and icinga alerts for a Kafka broker.
+# $replica_maxlag_warning and $replica_maxlag_critical are the thresholds of the replica max lag alert below.
+class profile::kafka::broker::monitoring (
+    $cluster                 = hiera('cluster'),
+    $prometheus_nodes        = hiera('prometheus_nodes'),
+    $replica_maxlag_warning  = hiera('profile::kafka::broker::monitoring::replica_maxlag_warning'),
+    $replica_maxlag_critical = hiera('profile::kafka::broker::monitoring::replica_maxlag_critical'),
+) {
+    ### Expose Kafka Broker JMX metrics to Prometheus
+    require_package('prometheus-jmx-exporter')
+
+    $prometheus_jmx_exporter_port = 7800
+    $jmx_exporter_config_file = '/etc/kafka/broker_prometheus_jmx_exporter.yaml'
+
+    # Use this in the JAVA_OPTS you pass to the Kafka broker process
+    $java_opts = "-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:${prometheus_jmx_exporter_port}:${jmx_exporter_config_file}"
+
+    # Create the Prometheus JMX Exporter configuration
+    file { $jmx_exporter_config_file:
+        ensure  => present,
+        source  => 'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
+        owner   => 'kafka',
+        group   => 'kafka',
+        mode    => '0400',
+        # Require this to make sure that kafka user and group are already created.
+        require => Class['::confluent::kafka::broker'],
+    }
+
+    # Allow automatic generation of config on the Prometheus master
+    prometheus::jmx_exporter_instance { $::hostname:
+        address => $::ipaddress,
+        port    => $prometheus_jmx_exporter_port,
+    }
+
+    # Open the exporter port to the Prometheus nodes; port is kept a string, as ferm::service was given before.
+    $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
+    ferm::service { 'kafka-broker-jmx_exporter':
+        proto  => 'tcp',
+        port   => "${prometheus_jmx_exporter_port}",
+        srange => "@resolve((${prometheus_nodes_ferm}))",
+    }
+
+    ### Icinga alerts
+    # Generate icinga alert if Kafka Broker Server is not running.
+    nrpe::monitor_service { 'kafka':
+        description  => 'Kafka Broker Server',
+        nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a "Kafka /etc/kafka/server.properties"',
+        critical     => true,
+    }
+
+    # Prometheus labels for this Kafka Broker instance
+    $prometheus_labels = "cluster=kafka_${cluster},instance=${::hostname}:${prometheus_jmx_exporter_port},job=jmx_kafka"
+
+    # Alert on the average number of under replicated partitions over the last 30 minutes.
+    monitoring::check_prometheus { 'kafka_broker_under_replicated_partitions':
+        description    => 'Kafka Broker Under Replicated Partitions',
+        metric         => "scalar(avg_over_time(kafka_server_replicamanager_underreplicatedpartitions{${prometheus_labels}}[30m]))",
+        warning        => 5,
+        critical       => 10,
+        prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+    }
+
+    # Alert on the average max replica lag over the last 30 minutes.
+    monitoring::check_prometheus { 'kafka_broker_replica_max_lag':
+        description    => 'Kafka Broker Replica Max Lag',
+        metric         => "scalar(avg_over_time(kafka_server_replicafetchermanager_maxlag{${prometheus_labels}}[30m]))",
+        warning        => $replica_maxlag_warning,
+        critical       => $replica_maxlag_critical,
+        prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+    }
+}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/381489
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <ao...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to