Ottomata has uploaded a new change for review. https://gerrit.wikimedia.org/r/168147
Change subject: Remove all kraken references ...................................................................... Remove all kraken references The kraken repository (and previously named cluster) have been deprecated and are no longer supported or used. Change-Id: I6f2900f2a4643a8fce98b3b637f3dd82ef1d1043 --- D files/ganglia/plugins/kraken_webrequest_loss.py D files/ganglia/plugins/kraken_webrequest_loss.pyconf M manifests/misc/monitoring.pp M manifests/role/analytics.pp D manifests/role/analytics/kraken.pp M manifests/role/analytics/refinery.pp M manifests/role/deployment.pp M manifests/site.pp M modules/contint/manifests/packages.pp 9 files changed, 3 insertions(+), 312 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/47/168147/1 diff --git a/files/ganglia/plugins/kraken_webrequest_loss.py b/files/ganglia/plugins/kraken_webrequest_loss.py deleted file mode 100644 index 8b966a8..0000000 --- a/files/ganglia/plugins/kraken_webrequest_loss.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" - Python Gmond Module for Kraken Webrequest Loss Percentage. - Loss percentage per source host data is generated by the packetloss - Oozie job in Kraken. - - :copyright: (c) 2012 Wikimedia Foundation - :author: Andrew Otto <[email protected]> - :license: GPL - -""" -from __future__ import print_function - -import logging -import commands - -UPDATE_INTERVAL = 3600 # seconds - -# Config for multiple metrics. -# Currently we only compute a single webrequest loss -# percentage, but this allows us to add more later. -metrics = { - 'webrequest_loss_average': { - 'description': 'Average Webrequest Loss Percentage', - 'path': '/wmf/data/webrequest/loss', - } -} - -def latest_loss_path(metric_name): - """Returns HDFS path to the most recently generated webrequest loss data.""" - logging.debug("latest_loss_path(%s)" % metrics[metric_name]['path']) - return commands.getoutput("/usr/bin/hadoop fs -ls %s | /usr/bin/tail -n 1 | /usr/bin/awk '{print $NF}'" % (metrics[metric_name]['path'])) - -def loss_data(loss_path): - """Returns the output data inside the HDFS loss_path.""" - logging.debug("loss_data(%s)" % loss_path) - return commands.getoutput("/usr/bin/hadoop fs -cat %s/part*" % (loss_path)) - -def loss_average(loss_data): - """Parses loss_data for loss percentages and averages them all.""" - logging.debug("loss_average(%s)" % loss_data) - percent_sum = 0.0 - loss_lines = loss_data.split("\n") - for line in loss_lines: - fields = line.split("\t") - percent = fields[-1] - percent_sum += float(percent) - - average_percent = (percent_sum / float(len(loss_lines))) - return average_percent - -def metric_handler(name): - """Get value of particular metric; part of Gmond interface""" - logging.debug('metric_handler(): %s', name) - return loss_average(loss_data(latest_loss_path(name))) - -def metric_init(params): - global descriptors - - descriptors = [] - for metric_name, metric_config in metrics.items(): - descriptors.append({ - 'name': metric_name, - 'call_back': metric_handler, - 'time_max': 3660, - 'value_type': 'float', - 'units': '%', - 'slope': 'both', - 'format': '%f', - 'description': metric_config['description'], - 'groups': 'analytics' - }) - - return descriptors - - -def metric_cleanup(): - """Teardown; part of Gmond interface""" - pass - - -if __name__ == '__main__': - # When invoked as standalone script, run a self-test by querying each - # metric descriptor and printing it out. - logging.basicConfig(level=logging.DEBUG) - for metric in metric_init({}): - value = metric['call_back'](metric['name']) - print(( "%s => " + metric['format'] ) % ( metric['name'], value )) diff --git a/files/ganglia/plugins/kraken_webrequest_loss.pyconf b/files/ganglia/plugins/kraken_webrequest_loss.pyconf deleted file mode 100644 index 2ea2fea..0000000 --- a/files/ganglia/plugins/kraken_webrequest_loss.pyconf +++ /dev/null @@ -1,20 +0,0 @@ -# Gmond configuration for calculating -# webrequest data loss stored in HDFS in Kraken. - -modules { - module { - name = "kraken_webrequest_loss" - language = "python" - } -} - -collection_group { - collect_every = 3600 - time_threshold = 3660 - - metric { - name = "webrequest_loss_average" - title = "Average Loss Percentage" - value_threshold = 0 - } -} diff --git a/manifests/misc/monitoring.pp b/manifests/misc/monitoring.pp index b4fa29b..e27147a 100644 --- a/manifests/misc/monitoring.pp +++ b/manifests/misc/monitoring.pp @@ -64,37 +64,6 @@ } } -# == Class misc::monitoring::kraken::loss -# Checks recently generated webrequest loss statistics in -# Kraken HDFS and sends the average loss percentage to ganglia. -# -class misc::monitoring::kraken::loss { - file { - '/usr/lib/ganglia/python_modules/kraken_webrequest_loss.py': - require => File['/usr/lib/ganglia/python_modules'], - source => 'puppet:///files/ganglia/plugins/kraken_webrequest_loss.py', - notify => Service['gmond']; - '/etc/ganglia/conf.d/udp_stats.pyconf': - require => File['/usr/lib/ganglia/python_modules/kraken_webrequest_loss.py'], - source => 'puppet:///files/ganglia/plugins/kraken_webrequest_loss.pyconf', - notify => Service['gmond']; - } - - # Set up icinga monitoring of Kraken HDFS data loss. - monitor_service { 'kraken_webrequest_loss_average_positive': - description => 'webrequest_loss_average_positive', - check_command => 'check_kraken_webrequest_loss_positive!2!8', - contact_group => 'analytics', - } - # It is possible to have negative data loss. This would mean that - # we are receiving duplicates log lines. We need alerts for this too. - monitor_service { 'kraken_webrequest_loss_average_negative': - description => 'webrequest_loss_average_negative', - check_command => 'check_kraken_webrequest_loss_negative!-2!-8', - contact_group => 'analytics', - } -} - # Ganglia views that should be # avaliable on ganglia.wikimedia.org class misc::monitoring::views { diff --git a/manifests/role/analytics.pp b/manifests/role/analytics.pp index d54501c..24d7e33 100644 --- a/manifests/role/analytics.pp +++ b/manifests/role/analytics.pp @@ -18,9 +18,6 @@ # == Class role::analytics::clients # Includes common client classes for # working with hadoop and other analytics services. -# This class is often included by including -# role::analytics::kraken, but you may include -# it on its own if you don't need any kraken code. class role::analytics::clients { include role::analytics diff --git a/manifests/role/analytics/kraken.pp b/manifests/role/analytics/kraken.pp deleted file mode 100644 index e390cec..0000000 --- a/manifests/role/analytics/kraken.pp +++ /dev/null @@ -1,148 +0,0 @@ -# kraken.pp - role classes dealing with Kraken data analysis. -# -# NOTE! 'kraken' will be renamed soon. - -# == Class role::analytics::kraken -# Kraken refers to the Analytics codebase used to generate -# analytics for WMF. -class role::analytics::kraken { - # Need Hadoop client classes included to use Kraken. - include role::analytics::clients - - # We want to be able to geolocate IP addresses - include geoip - # udp-filter is a useful thing! - include misc::udp2log::udp_filter - - # many Kraken python scripts use docopt for CLI parsing. - if !defined(Package['python-docopt']) { - package { 'python-docopt': - ensure => 'installed', - } - } - - # Many kraken jobs use dclass for - # User Agent Device classification - package { 'libdclass-java': - ensure => 'installed', - } - - # Include kraken/deploy repository target. - package { 'analytics/kraken/deploy': - provider => 'trebuchet', - } - # kraken/deploy repository is deployed via git deploy into here. - # You must deploy this yourself, puppet will not do it for you. - $path = '/srv/deployment/analytics/kraken/deploy/kraken' - - # Path in HDFS in which external data should be imported. - $external_data_hdfs_dir = '/wmf/data/external' - - # Create directory in /var/log for general purpose Kraken job logging. - $log_dir = '/var/log/kraken' - file { $log_dir: - ensure => 'directory', - owner => 'root', - group => 'stats', - # setgid bit here to make kraken log files writeable - # by users in the stats group. - mode => '2775', - } - -} - -# == Class role::analytics::kraken::jobs::import::kafka -# Submits Camus MapReduce jobs to import data from Kafka. -class role::analytics::kraken::jobs::import::kafka { - require role::analytics::kraken - - $camus_webrequest_properties = "${::role::analytics::kraken::path}/kraken-etl/conf/camus.webrequest.properties" - $camus_webrequest_log_file = "${::role::analytics::kraken::log_dir}/camus-webrequest.log" - cron { 'kraken-import-hourly-webrequest': - command => "${::role::analytics::kraken::path}/kraken-etl/camus --job-name camus-webrequest-import ${camus_webrequest_properties} >> ${camus_webrequest_log_file} 2>&1", - user => 'hdfs', # we might want to use a different user for this, not sure. - minute => '*/10', - } - - $camus_eventlogging_properties = "${::role::analytics::kraken::path}/kraken-etl/conf/camus.eventlogging.properties" - $camus_eventlogging_log_file = "${::role::analytics::kraken::log_dir}/camus-eventlogging.log" - cron { 'kraken-import-hourly-eventlogging': - command => "${::role::analytics::kraken::path}/kraken-etl/camus --job-name camus-eventlogging-import ${camus_eventlogging_properties} >> ${camus_eventlogging_log_file} 2>&1", - user => 'hdfs', # we might want to use a different user for this, not sure. - minute => '*/10', - } -} - -# == Class role::analytics::kraken::import::pagecounts -# Handles importing of hourly pagecount statistics into -# HDFS and creating Hive partition tables. -class role::analytics::kraken::jobs::import::pagecounts { - include role::analytics::kraken - - $script = "${role::analytics::kraken::path}/kraken-etl/pagecount-importer" - $datadir = $role::analytics::kraken::external_data_hdfs_dir - - # Don't attempt to import anything before this date. - # This imports everything since August 1 2013. - $start_date = '2013.07.31_23' - - # Note: I'm not worried about logrotate yet. - # This generates just a few lines per hour. - $log_file = "${role::analytics::kraken::log_dir}/pagecount-importer.log" - - # make sure the script has been deployed. - exec { "${script}-exists": - command => "/usr/bin/test -f ${script}", - # This exec doesn't actually create $script, but - # we don't need to run test -f it puppet can already - # tell that the file exists. - creates => $script, - } - - # cron job to download any missing pagecount files from - # dumps.wikimedia.org and store them into HDFS. - cron { 'kraken-import-hourly-pagecounts': - command => "${script} --start ${start_date} ${datadir} >> ${log_file} 2>&1", - user => 'hdfs', - minute => 5, - require => Exec["${script}-exists"], - } -} - -# == Class role::analytics::kraken::hive::partitions::external -# Installs cron job that creates external Hive partitions for imported -# datasets in $external_data_hdfs_dir. -class role::analytics::kraken::jobs::hive::partitions::external { - include role::analytics::kraken - - $script = "${role::analytics::kraken::path}/kraken-etl/hive-partitioner" - $datadir = $role::analytics::kraken::external_data_hdfs_dir - $database = 'wmf' - - # We are only using hive-partition to add partitions to the pagecounts table. - # The webrequest table is using Oozie. - $tables = 'pagecounts' - - # Note: I'm not worried about logrotate yet. - # This generates just a few lines per hour. - $log_file = "${role::analytics::kraken::log_dir}/hive-partitioner.log" - - # make sure the script has been deployed. - exec { "${script}-exists": - command => "/usr/bin/test -x ${script}", - # This exec doesn't actually create $script, but - # we don't need to run test it puppet can already - # tell that the file exists. - creates => $script, - } - - - # cron job to automatically create hive partitions for any - # newly imported data. - cron { 'kraken-create-external-hive-partitions': - command => "${script} --database ${database} --tables ${tables} ${datadir} >> ${log_file} 2>&1", - user => 'hdfs', - minute => 21, - require => Exec["${script}-exists"], - } -} diff --git a/manifests/role/analytics/refinery.pp b/manifests/role/analytics/refinery.pp index 2db7b20..532efad 100644 --- a/manifests/role/analytics/refinery.pp +++ b/manifests/role/analytics/refinery.pp @@ -3,7 +3,7 @@ # and using the analytics/refinery repository. # class role::analytics::refinery { - # Many Kraken python scripts use docopt for CLI parsing. + # Some refinery python scripts use docopt for CLI parsing. if !defined(Package['python-docopt']) { package { 'python-docopt': ensure => 'installed', diff --git a/manifests/role/deployment.pp b/manifests/role/deployment.pp index 8abe908..adf33df 100644 --- a/manifests/role/deployment.pp +++ b/manifests/role/deployment.pp @@ -77,11 +77,6 @@ 'gitfat_enabled' => true, 'upstream' => 'https://gerrit.wikimedia.org/r/operations/software/elasticsearch/plugins', }, - 'analytics/kraken/deploy' => { - 'gitfat_enabled' => true, - 'checkout_submodules' => true, - 'upstream' => 'https://gerrit.wikimedia.org/r/p/analytics/kraken/deploy', - }, 'analytics/refinery' => { 'gitfat_enabled' => true, 'upstream' => 'https://gerrit.wikimedia.org/r/analytics/refinery', diff --git a/manifests/site.pp b/manifests/site.pp index 576a96e..311c159 100644 --- a/manifests/site.pp +++ b/manifests/site.pp @@ -254,10 +254,8 @@ include role::logging::udp2log::misc } -# analytics1027 hosts the frontend -# interfaces to Kraken and Hadoop. -# (Hue, Oozie, Hive, etc.). It -# also submits regularly scheduled +# analytics1027 hosts some frontend web interfaces to Hadoop +# (Hue, Oozie, Hive, etc.). It also submits regularly scheduled # batch Hadoop jobs. node 'analytics1027.eqiad.wmnet' { $nagios_group = 'analytics_eqiad' diff --git a/modules/contint/manifests/packages.pp b/modules/contint/manifests/packages.pp index b510f53..eddcf91 100644 --- a/modules/contint/manifests/packages.pp +++ b/modules/contint/manifests/packages.pp @@ -122,17 +122,6 @@ package { ['libcidr0-dev', 'libanon0-dev']: ensure => 'latest', } - - # Used for mobile device classification in Kraken: - package { [ - 'libdclass0', - 'libdclass0-dev', - 'libdclass-jni', - 'libdclass-java', - 'libdclass-data', - ]: - ensure => 'installed', - } } if ubuntu_version('>= trusty') { -- To view, visit https://gerrit.wikimedia.org/r/168147 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I6f2900f2a4643a8fce98b3b637f3dd82ef1d1043 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
