Ottomata has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/369724 )

Change subject: Update kafka server.properties to work with defaults
......................................................................


Update kafka server.properties to work with defaults

This mostly just makes server.properties look closer to what is
provided from upstream, and lets Kafka pick default values
if they are not overridden by puppet.

To ensure no-ops on prod Kafka brokers, the analytics and main
kafka broker roles manually set the parameters in question,
even though they are the Kafka defaults.

Bug: T166162
Change-Id: I3881cfc82b6605098f3389533596983550c79157
---
M hieradata/role/common/kafka/jumbo/broker.yaml
M hieradata/role/common/kafka/simple/broker.yaml
M modules/confluent/manifests/kafka/broker.pp
M modules/confluent/templates/kafka/server.properties.erb
M modules/profile/manifests/kafka/broker.pp
M modules/role/manifests/kafka/analytics/broker.pp
M modules/role/manifests/kafka/jumbo/broker.pp
M modules/role/manifests/kafka/main/broker.pp
8 files changed, 100 insertions(+), 75 deletions(-)

Approvals:
  Ottomata: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/hieradata/role/common/kafka/jumbo/broker.yaml 
b/hieradata/role/common/kafka/jumbo/broker.yaml
index 34c0839..c8215ca 100644
--- a/hieradata/role/common/kafka/jumbo/broker.yaml
+++ b/hieradata/role/common/kafka/jumbo/broker.yaml
@@ -20,8 +20,13 @@
 profile::kafka::broker::auto_leader_rebalance_enable: true
 profile::kafka::broker::nofiles_ulimit: 65536
 
+profile::kafka::broker::log_retention_hours: 168  # 1 week
+
 # Bump this up to get a little more parallelism between replicas.
 profile::kafka::broker::num_replica_fetchers: 12
-profile::kafka::broker::log_retention_hours: 168  # 1 week
+# 12 disks in the broker HW RAID array
+profile::kafka::broker::num_recovery_threads_per_data_dir: 12
+profile::kafka::broker::num_io_threads: 12
+
 profile::kafka::broker::replica_maxlag_warning: "1000000"
 profile::kafka::broker::replica_maxlag_critical: "5000000"
diff --git a/hieradata/role/common/kafka/simple/broker.yaml 
b/hieradata/role/common/kafka/simple/broker.yaml
index 92aafc6..0ac2c22 100644
--- a/hieradata/role/common/kafka/simple/broker.yaml
+++ b/hieradata/role/common/kafka/simple/broker.yaml
@@ -11,3 +11,5 @@
 profile::kafka::broker::log_retention_hours: 168  # 1 week
 profile::kafka::broker::replica_maxlag_warning: "1000"
 profile::kafka::broker::replica_maxlag_critical: "10000"
+profile::kafka::broker::num_recovery_threads_per_data_dir: 1
+profile::kafka::broker::num_io_threads: 1
\ No newline at end of file
diff --git a/modules/confluent/manifests/kafka/broker.pp 
b/modules/confluent/manifests/kafka/broker.pp
index 4b41de4..a6ca126 100644
--- a/modules/confluent/manifests/kafka/broker.pp
+++ b/modules/confluent/manifests/kafka/broker.pp
@@ -221,21 +221,22 @@
     $log_dirs                            = ['/var/spool/kafka'],
 
     $zookeeper_connect                   = 'localhost:2181',
-    $zookeeper_connection_timeout_ms     = 6000,
-    $zookeeper_session_timeout_ms        = 6000,
+    $zookeeper_connection_timeout_ms     = undef,
+    $zookeeper_session_timeout_ms        = undef,
 
     $auto_create_topics_enable           = true,
     $auto_leader_rebalance_enable        = true,
 
     $num_partitions                      = 1,
     $default_replication_factor          = 1,
+    $offsets_topic_replication_factor    = undef,
     $min_insync_replicas                 = 1,
     $replica_lag_time_max_ms             = undef,
     $num_recovery_threads_per_data_dir   = undef,
     $replica_socket_timeout_ms           = undef,
     $replica_socket_receive_buffer_bytes = undef,
     $num_replica_fetchers                = 1,
-    $replica_fetch_max_bytes             = 1048576,
+    $replica_fetch_max_bytes             = undef,
 
     $num_network_threads                 = undef,
     $num_io_threads                      = size($log_dirs),
@@ -243,16 +244,15 @@
     $socket_receive_buffer_bytes         = 1048576,
     $socket_request_max_bytes            = undef,
 
-    # TODO: Tune these defaults?
-    $log_flush_interval_messages         = 10000,
-    $log_flush_interval_ms               = 1000,
+    $log_flush_interval_messages         = undef,
+    $log_flush_interval_ms               = undef,
 
     $log_retention_hours                 = 168,     # 1 week
     $log_retention_bytes                 = undef,
     $log_segment_bytes                   = undef,
 
     $log_retention_check_interval_ms     = undef,
-    $log_cleanup_policy                  = 'delete',
+    $log_cleanup_policy                  = undef,
 
     $offsets_retention_minutes           = 10080,   # 1 week
 
diff --git a/modules/confluent/templates/kafka/server.properties.erb 
b/modules/confluent/templates/kafka/server.properties.erb
index 2801edd..34e654a 100644
--- a/modules/confluent/templates/kafka/server.properties.erb
+++ b/modules/confluent/templates/kafka/server.properties.erb
@@ -96,6 +96,12 @@
 # Default to the number of brokers in this cluster.
 default.replication.factor=<%= @default_replication_factor %>
 
+<% if @offsets_topic_replication_factor -%>
+# The replication factor for the group metadata internal topics 
"__consumer_offsets" and "__transaction_state"
# For anything other than development testing, a value greater than 1 (such 
as 3) is recommended to ensure availability.
+offsets.topic.replication.factor=<%= @offsets_topic_replication_factor %>
+
+<% end -%>
 # When a producer sets acks to "all" (or "-1"), min.insync.replicas specifies 
the minimum number of
 # replicas that must acknowledge a write for the write to be considered 
successful. If this minimum
 # cannot be met, then the producer will raise an exception (either 
NotEnoughReplicas or
@@ -154,22 +160,17 @@
 <% if @log_flush_interval_messages or @log_flush_interval_ms -%>
 ############################# Log Flush Policy #############################
 
-# Messages are immediately written to the filesystem but by default we only
-# fsync() to sync the OS cache lazily. The following configurations control the
-# flush of data to disk.
+# Messages are immediately written to the filesystem but by default we only 
fsync() to sync
+# the OS cache lazily. The following configurations control the flush of data 
to disk.
 # There are a few important trade-offs here:
 #    1. Durability: Unflushed data may be lost if you are not using 
replication.
-#    2. Latency: Very large flush intervals may lead to latency spikes when the
-#       flush does occur as there will be a lot of data to flush.
-#    3. Throughput: The flush is generally the most expensive operation, and a
-#       small flush interval may lead to exceessive seeks.
-# The settings below allow one to configure the flush policy to flush data
-# after a period of time or every N messages (or both). This can be done
-# globally and overridden on a per-topic basis.
+#    2. Latency: Very large flush intervals may lead to latency spikes when 
the flush does occur as there will be a lot of data to flush.
+#    3. Throughput: The flush is generally the most expensive operation, and a 
small flush interval may lead to excessive seeks.
+# The settings below allow one to configure the flush policy to flush data 
after a period of time or
+# every N messages (or both). This can be done globally and overridden on a 
per-topic basis.
 
 <% if @log_flush_interval_messages -%>
-# The number of messages accumulated on a log partition before messages
-# are flushed to disk.
+# The number of messages to accept before forcing a flush of data to disk
 log.flush.interval.messages=<%= @log_flush_interval_messages %>
 
 <% end -%>
@@ -195,25 +196,24 @@
 # criteria are met. Deletion always happens from the end of the log.
 
 <% if @log_retention_hours -%>
-# The minimum age of a log file to be eligible for deletion
+# The minimum age of a log file to be eligible for deletion due to age
 log.retention.hours=<%= @log_retention_hours %>
 
 <% end -%>
 <% if @log_retention_bytes -%>
-# A size-based retention policy for logs. Segments are pruned from the log as
-# long as the remaining segments don't drop below log.retention.bytes.
+# A size-based retention policy for logs. Segments are pruned from the log as 
long as the remaining
+# segments don't drop below log.retention.bytes. Functions independently of 
log.retention.hours.
 log.retention.bytes=<%= @log_retention_bytes %>
 
 <% end -%>
 <% if @log_segment_bytes -%>
-# The maximum size of a log segment file. When this size is reached a new log
-# segment will be created.
+# The maximum size of a log segment file. When this size is reached a new log 
segment will be created.
 log.segment.bytes=<%= @log_segment_bytes %>
 
 <% end -%>
 <% if @log_retention_check_interval_ms -%>
-# The interval at which log segments are checked to see if they can be deleted
-# according to the retention policies
+# The interval at which log segments are checked to see if they can be deleted 
according
+# to the retention policies
 log.retention.check.interval.ms=<%= @log_retention_check_interval_ms %>
 
 <% end -%>
@@ -242,8 +242,7 @@
 # root directory for all kafka znodes.
 zookeeper.connect=<%= @zookeeper_connect %>
 
-# The maximum amount of time in ms that the client waits to establish a
-# connection to zookeeper
+# Timeout in ms for connecting to zookeeper
 zookeeper.connection.timeout.ms=<%= @zookeeper_connection_timeout_ms %>
 
 # Zookeeper session timeout. If the server fails to heartbeat to Zookeeper
@@ -253,10 +252,8 @@
 zookeeper.session.timeout.ms=<%= @zookeeper_session_timeout_ms %>
 
 ##################### Confluent Proactive Support ######################
-
-# If set to true, then the feature to collect and report support metrics
-# ("Metrics") is enabled.  If set to false, the feature is disabled.
-#
+# If set to true, and the confluent-support-metrics package is installed,
+# then the feature to collect and report support metrics is enabled.
 confluent.support.metrics.enable=false
 
 # The customer ID under which support metrics will be collected and
diff --git a/modules/profile/manifests/kafka/broker.pp 
b/modules/profile/manifests/kafka/broker.pp
index bf10b4a..fb353be 100644
--- a/modules/profile/manifests/kafka/broker.pp
+++ b/modules/profile/manifests/kafka/broker.pp
@@ -59,20 +59,22 @@
 #   Hiera: profile::kafka::broker::replica_maxlag_critical
 #
 class profile::kafka::broker(
-    $kafka_cluster_name           = hiera('kafka_cluster_name'),
-    $statsd                       = hiera('statsd'),
+    $kafka_cluster_name                = hiera('kafka_cluster_name'),
+    $statsd                            = hiera('statsd'),
 
-    $plaintext                    = hiera('profile::kafka::broker::plaintext'),
-    # $tls_secrets_path             = 
hiera('profile::kafka::broker::tls_secrets_path'),
-    # $tls_key_password             = 
hiera('profile::kafka::broker::tls_key_password'),
+    $plaintext                         = 
hiera('profile::kafka::broker::plaintext'),
+    # $tls_secrets_path                = 
hiera('profile::kafka::broker::tls_secrets_path'),
+    # $tls_key_password                = 
hiera('profile::kafka::broker::tls_key_password'),
 
-    $log_dirs                     = hiera('profile::kafka::broker::log_dirs'),
-    $auto_leader_rebalance_enable = 
hiera('profile::kafka::broker::auto_leader_rebalance_enable'),
-    $log_retention_hours          = 
hiera('profile::kafka::broker::log_retention_hours'),
-    $num_replica_fetchers         = 
hiera('profile::kafka::broker::num_replica_fetchers'),
-    $nofiles_ulimit               = 
hiera('profile::kafka::broker::nofiles_ulimit'),
-    $replica_maxlag_warning       = 
hiera('profile::kafka::broker::replica_maxlag_warning'),
-    $replica_maxlag_critical      = 
hiera('profile::kafka::broker::replica_maxlag_critical'),
+    $log_dirs                          = 
hiera('profile::kafka::broker::log_dirs'),
+    $auto_leader_rebalance_enable      = 
hiera('profile::kafka::broker::auto_leader_rebalance_enable'),
+    $log_retention_hours               = 
hiera('profile::kafka::broker::log_retention_hours'),
+    $num_recovery_threads_per_data_dir = 
hiera('profile::kafka::broker::num_recovery_threads_per_data_dir'),
+    $num_io_threads                    = 
hiera('profile::kafka::broker::num_io_threads'),
+    $num_replica_fetchers              = 
hiera('profile::kafka::broker::num_replica_fetchers'),
+    $nofiles_ulimit                    = 
hiera('profile::kafka::broker::nofiles_ulimit'),
+    $replica_maxlag_warning            = 
hiera('profile::kafka::broker::replica_maxlag_warning'),
+    $replica_maxlag_critical           = 
hiera('profile::kafka::broker::replica_maxlag_critical'),
 ) {
     # TODO: WIP
     $tls_secrets_path = undef
@@ -150,32 +152,32 @@
     }
 
     class { '::confluent::kafka::broker':
-        log_dirs                       => $log_dirs,
-        brokers                        => $config['brokers']['hash'],
-        zookeeper_connect              => $config['zookeeper']['url'],
-        nofiles_ulimit                 => $nofiles_ulimit,
-        default_replication_factor     => min(3, $config['brokers']['size']),
-
+        log_dirs                         => $log_dirs,
+        brokers                          => $config['brokers']['hash'],
+        zookeeper_connect                => $config['zookeeper']['url'],
+        nofiles_ulimit                   => $nofiles_ulimit,
+        default_replication_factor       => min(3, $config['brokers']['size']),
+        offsets_topic_replication_factor => min(3,  
$config['brokers']['size']),
         # TODO: This can be removed once it is a default
         # in ::confluent::kafka module
-        inter_broker_protocol_version  => '0.11.0',
+        inter_broker_protocol_version    => '0.11.0',
         # Use Kafka/LinkedIn recommended settings with G1 garbage collector.
         # https://kafka.apache.org/documentation/#java
         # Note that MetaspaceSize is a Java 8 setting.
-        jvm_performance_opts           => '-server -XX:MetaspaceSize=96m 
-XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 
-XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 
-XX:MaxMetaspaceFreeRatio=80',
+        jvm_performance_opts             => '-server -XX:MetaspaceSize=96m 
-XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 
-XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 
-XX:MaxMetaspaceFreeRatio=80',
 
-        listeners                      => $listeners,
+        listeners                        => $listeners,
 
-        security_inter_broker_protocol => $security_inter_broker_protocol,
-        ssl_keystore_location          => $ssl_keystore_location,
-        ssl_keystore_password          => $ssl_keystore_password,
-        ssl_key_password               => $ssl_key_password,
-        ssl_truststore_location        => $ssl_truststore_location,
-        ssl_truststore_password        => $ssl_truststore_password,
-        ssl_client_auth                => $ssl_client_auth,
+        security_inter_broker_protocol   => $security_inter_broker_protocol,
+        ssl_keystore_location            => $ssl_keystore_location,
+        ssl_keystore_password            => $ssl_keystore_password,
+        ssl_key_password                 => $ssl_key_password,
+        ssl_truststore_location          => $ssl_truststore_location,
+        ssl_truststore_password          => $ssl_truststore_password,
+        ssl_client_auth                  => $ssl_client_auth,
 
-        auto_leader_rebalance_enable   => $auto_leader_rebalance_enable,
-        num_replica_fetchers           => $num_replica_fetchers,
+        auto_leader_rebalance_enable     => $auto_leader_rebalance_enable,
+        num_replica_fetchers             => $num_replica_fetchers,
     }
 
     class { '::confluent::kafka::broker::jmxtrans':
diff --git a/modules/role/manifests/kafka/analytics/broker.pp 
b/modules/role/manifests/kafka/analytics/broker.pp
index b602910..5a9fcbc 100644
--- a/modules/role/manifests/kafka/analytics/broker.pp
+++ b/modules/role/manifests/kafka/analytics/broker.pp
@@ -103,6 +103,14 @@
         log_retention_hours             => 
hiera('confluent::kafka::broker::log_retention_hours', 168),
         # Use LinkedIn recommended settings with G1 garbage collector,
         jvm_performance_opts            => '-server -XX:PermSize=48m 
-XX:MaxPermSize=48m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 
-XX:InitiatingHeapOccupancyPercent=35',
+
+        # These defaults are set to keep no-ops from changes
+        # made in confluent module for T166162.
+        # They should be removed (since they are the kafka or module defaults)
+        # when this role gets converted to a profile.
+        replica_fetch_max_bytes         => 1048576,
+        log_flush_interval_messages     => 10000,
+        log_cleanup_policy              => 'delete',
     }
 
     class { '::confluent::kafka::broker::jmxtrans':
diff --git a/modules/role/manifests/kafka/jumbo/broker.pp 
b/modules/role/manifests/kafka/jumbo/broker.pp
index 71920b2..36ba275 100644
--- a/modules/role/manifests/kafka/jumbo/broker.pp
+++ b/modules/role/manifests/kafka/jumbo/broker.pp
@@ -1,8 +1,8 @@
-# == Class role::kafka::aggregate::broker
+# == Class role::kafka::jumbo::broker
 # Sets up a Kafka broker in the 'jumbo' Kafka cluster.
 #
 class role::kafka::jumbo::broker {
-    system::role { 'role::kafka::aggregate::broker':
+    system::role { 'role::kafka::jumbo::broker':
         description => "Kafka Broker in an 'jumbo' Kafka cluster",
     }
 
diff --git a/modules/role/manifests/kafka/main/broker.pp 
b/modules/role/manifests/kafka/main/broker.pp
index 08d6ade..df7fad3 100644
--- a/modules/role/manifests/kafka/main/broker.pp
+++ b/modules/role/manifests/kafka/main/broker.pp
@@ -36,27 +36,38 @@
     }
 
     class { '::confluent::kafka::broker':
-        log_dirs                      => ['/srv/kafka/data'],
-        brokers                       => $config['brokers']['hash'],
-        zookeeper_connect             => $config['zookeeper']['url'],
-        nofiles_ulimit                => $nofiles_ulimit,
-        jmx_port                      => $config['jmx_port'],
+        log_dirs                        => ['/srv/kafka/data'],
+        brokers                         => $config['brokers']['hash'],
+        zookeeper_connect               => $config['zookeeper']['url'],
+        nofiles_ulimit                  => $nofiles_ulimit,
+        jmx_port                        => $config['jmx_port'],
 
         # I don't trust auto.leader.rebalance :)
-        auto_leader_rebalance_enable  => false,
+        auto_leader_rebalance_enable    => false,
 
-        default_replication_factor    => $replication_factor,
+        default_replication_factor      => $replication_factor,
 
         # Should be changed if brokers are upgraded.
-        inter_broker_protocol_version => '0.9.0.X',
+        inter_broker_protocol_version   => '0.9.0.X',
 
         # Start with a low number of (auto created) partitions per
         # topic.  This can be increased manually for high volume
         # topics if necessary.
-        num_partitions                => 1,
+        num_partitions                  => 1,
 
         # Use LinkedIn recommended settings with G1 garbage collector.
-        jvm_performance_opts          => '-server -XX:PermSize=48m 
-XX:MaxPermSize=48m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 
-XX:InitiatingHeapOccupancyPercent=35',
+        jvm_performance_opts            => '-server -XX:PermSize=48m 
-XX:MaxPermSize=48m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 
-XX:InitiatingHeapOccupancyPercent=35',
+
+        # These defaults are set to keep no-ops from changes
+        # made in confluent module for T166162.
+        # They should be removed (since they are the kafka or module defaults)
+        # when this role gets converted to a profile.
+        replica_fetch_max_bytes         => 1048576,
+        log_flush_interval_messages     => 10000,
+        log_flush_interval_ms           => 1000,
+        log_cleanup_policy              => 'delete',
+        zookeeper_connection_timeout_ms => 6000,
+        zookeeper_session_timeout_ms    => 6000,
     }
 
     # Include Kafka Broker Jmxtrans class to

-- 
To view, visit https://gerrit.wikimedia.org/r/369724
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I3881cfc82b6605098f3389533596983550c79157
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Giuseppe Lavagetto <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to