Giuseppe Lavagetto has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/382721 )
Change subject: cacheproxy: move some content to new module ...................................................................... cacheproxy: move some content to new module Things that could be logically joined together in classes, but had no real citizenship either in the more-general varnish module or as profiles are now collected in a cacheproxy module, which is thought as a collector of varnish-related puppet resources that are mostly wmf-specific. Also, upgrade the traffic-pool systemd unit definition to use our more modern defined types. Change-Id: I7a6fabafbff4de5bcd8e496a31796ea406015316 --- A modules/cacheproxy/manifests/cron_restart.pp R modules/cacheproxy/manifests/performance.pp A modules/cacheproxy/manifests/traffic_pool.pp R modules/cacheproxy/templates/traffic-pool.service.erb R modules/cacheproxy/templates/varnish-backend-restart.cron.erb M modules/lvs/manifests/kernel_config.pp M modules/profile/manifests/cache/base.pp 7 files changed, 83 insertions(+), 59 deletions(-) Approvals: Giuseppe Lavagetto: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/cacheproxy/manifests/cron_restart.pp b/modules/cacheproxy/manifests/cron_restart.pp new file mode 100644 index 0000000..520c2d6 --- /dev/null +++ b/modules/cacheproxy/manifests/cron_restart.pp @@ -0,0 +1,25 @@ +# == Class cacheproxy::cron_restart +# +# Add a periodic restart every week as a cron job, staggering the time across a +# cluster +# +# === Parameters +# +# [*nodes*] The list of nodes we need to stagger restarts across. +# +class cacheproxy::cron_restart ($nodes, $cache_cluster) { + #TODO: maybe use the list of datacenters to do this? + $all_nodes = array_concat($nodes['eqiad'], $nodes['esams'], $nodes['ulsfo'], $nodes['codfw']) + $times = cron_splay($all_nodes, 'weekly', "${cache_cluster}-backend-restarts") + $be_restart_h = $times['hour'] + $be_restart_m = $times['minute'] + $be_restart_d = $times['weekday'] + + file { '/etc/cron.d/varnish-backend-restart': + mode => '0444', + owner => 'root', + group => 'root', + content => template('cacheproxy/varnish-backend-restart.cron.erb'), + require => File['/usr/local/sbin/varnish-backend-restart'], + } +} diff --git a/modules/role/manifests/cache/perf.pp b/modules/cacheproxy/manifests/performance.pp similarity index 98% rename from modules/role/manifests/cache/perf.pp rename to modules/cacheproxy/manifests/performance.pp index ff2ef3c..8efde2a 100644 --- a/modules/role/manifests/cache/perf.pp +++ b/modules/cacheproxy/manifests/performance.pp @@ -1,8 +1,9 @@ +# == Class cacheproxy::performance +# # This class contains production-specific performance hacks # These should have zero functional effect, they are merely system-level # tweaks to support heavy load/traffic. -class role::cache::perf { - include cpufrequtils # defaults to "performance" +class cacheproxy::performance { # Bump min_free_kbytes to ensure network buffers are available quickly # without having to evict cache on the spot diff --git a/modules/cacheproxy/manifests/traffic_pool.pp b/modules/cacheproxy/manifests/traffic_pool.pp new file mode 100644 index 0000000..165e7f5 --- /dev/null +++ b/modules/cacheproxy/manifests/traffic_pool.pp @@ -0,0 +1,34 @@ +# == Class cacheproxy::traffic_pool +# +# Manages pooling/depooling of servers in conftool +# upon startup/shutdown. +# +# +class cacheproxy::traffic_pool { + $varlib_path = '/var/lib/traffic-pool' + # note: we can't use 'service' because we don't want to 'ensure => + # stopped|running', but we still need to enable it + systemd::unit { 'traffic-pool.service': + ensure => present, + content => template('cacheproxy/traffic-pool.service.erb'), + } + + file { $varlib_path: + ensure => directory, + mode => '0755', + owner => 'root', + group => 'root', + } + + exec { 'systemd enable traffic-pool': + command => '/bin/systemctl enable traffic-pool.service', + unless => '/bin/systemctl is-enabled traffic-pool.service', + require => [Systemd::Unit['traffic-pool.service'],File[$varlib_path]] + } + + nrpe::monitor_systemd_unit_state { 'traffic-pool': + require => Systemd::Unit['traffic-pool.service'], + critical => false, # promote to true once better-tested in the real world + } + +} diff --git a/modules/role/files/cache/traffic-pool.service b/modules/cacheproxy/templates/traffic-pool.service.erb similarity index 100% rename from modules/role/files/cache/traffic-pool.service rename to modules/cacheproxy/templates/traffic-pool.service.erb diff --git a/modules/varnish/templates/varnish-backend-restart.cron.erb b/modules/cacheproxy/templates/varnish-backend-restart.cron.erb similarity index 100% rename from modules/varnish/templates/varnish-backend-restart.cron.erb rename to modules/cacheproxy/templates/varnish-backend-restart.cron.erb diff --git a/modules/lvs/manifests/kernel_config.pp b/modules/lvs/manifests/kernel_config.pp index 2849496..22b177a 100644 --- a/modules/lvs/manifests/kernel_config.pp +++ b/modules/lvs/manifests/kernel_config.pp @@ -36,7 +36,7 @@ 'net.ipv4.vs.schedule_icmp' => 1, # basic netdev tuning for 10GbE interfaces at full speed with RPS. - # See deeper details in role::cache::perf + # See deeper details in cacheproxy::performance 'net.core.netdev_max_backlog' => 300000, 'net.core.netdev_budget' => 1024, 'net.core.netdev_tstamp_prequeue' => 0, diff --git a/modules/profile/manifests/cache/base.pp b/modules/profile/manifests/cache/base.pp index d3e05a5..3ef0e1b 100644 --- a/modules/profile/manifests/cache/base.pp +++ b/modules/profile/manifests/cache/base.pp @@ -1,7 +1,7 @@ # === Class profile::cache::base # # Sets up some common things for cache instances: -# - LVS/conftool +# - conftool # - monitoring # - logging/analytics # - storage @@ -14,6 +14,15 @@ $purge_host_not_upload_re = hiera('profile::cache::base::purge_host_not_upload_re'), $storage_parts = hiera('profile::cache::base::purge_host_not_upload_re'), ) { + # There is no better way to do this, so it can't be a class parameter. In fact, + # I consider our requirement to make hiera calls parameters + # harmful, as it prevents us to do hiera key interpolation in + # subsequent hiera calls, but we did this because of the way the + # WM Cloud puppet UI works. meh. So, just disable the linter here. + # lint:ignore:wmf_styleguide + $nodes = hiera("cache::${cache_cluster}::nodes") + # lint:endignore + # Needed profiles require ::profile::conftool::client require ::profile::prometheus::varnish_exporter @@ -28,32 +37,21 @@ class { 'conftool::scripts': } - # Only production needs system perf tweaks if $::realm == 'production' { - include role::cache::perf + # Only production needs system perf tweaks + class { '::cpufrequtils': } + class { 'cacheproxy::performance': } + + # Periodic cron restarts, we need this to mitigate T145661 + class { 'cacheproxy::cron_restart': + nodes => $nodes, + cache_cluster => $cache_cluster, + } } # Not ideal factorization to put this here, but works for now class { 'varnish::zero_update': site => $zero_site, - } - - # XXX: temporary, we need this to mitigate T145661 - if $::realm == 'production' { - $hnodes = hiera("cache::${cache_cluster}::nodes") - $all_nodes = array_concat($hnodes['eqiad'], $hnodes['esams'], $hnodes['ulsfo'], $hnodes['codfw']) - $times = cron_splay($all_nodes, 'weekly', "${cache_cluster}-backend-restarts") - $be_restart_h = $times['hour'] - $be_restart_m = $times['minute'] - $be_restart_d = $times['weekday'] - - file { '/etc/cron.d/varnish-backend-restart': - mode => '0444', - owner => 'root', - group => 'root', - content => template('varnish/varnish-backend-restart.cron.erb'), - require => File['/usr/local/sbin/varnish-backend-restart'], - } } ########################################################################### @@ -64,42 +62,8 @@ statsd_host => $statsd_host, } - ########################################################################### # auto-depool on shutdown + conditional one-shot auto-pool on start - # note: we can't use 'service' because we don't want to 'ensure => - # stopped|running', and 'service_unit' with 'declare_service => false' - # wouldn't enable the service in systemd terms, either. - ########################################################################### - - $tp_unit_path = '/lib/systemd/system/traffic-pool.service' - $varlib_path = '/var/lib/traffic-pool' - - file { $tp_unit_path: - ensure => present, - source => 'puppet:///modules/role/cache/traffic-pool.service', - mode => '0444', - owner => 'root', - group => 'root', - } - - file { $varlib_path: - ensure => directory, - mode => '0755', - owner => 'root', - group => 'root', - } - - exec { 'systemd reload+enable for traffic-pool': - refreshonly => true, - command => '/bin/systemctl daemon-reload && /bin/systemctl enable traffic-pool', - subscribe => File[$tp_unit_path], - require => File[$varlib_path], - } - - nrpe::monitor_systemd_unit_state { 'traffic-pool': - require => File[$tp_unit_path], - critical => false, # promote to true once better-tested in the real world - } + class { 'cacheproxy::traffic_pool': } ########################################################################### -- To view, visit https://gerrit.wikimedia.org/r/382721 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I7a6fabafbff4de5bcd8e496a31796ea406015316 Gerrit-PatchSet: 5 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: Ema <e...@wikimedia.org> Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits