Ottomata has submitted this change and it was merged.

Change subject: Remove all kraken references
......................................................................


Remove all kraken references

The kraken repository (and the cluster previously named kraken) have
been deprecated and are no longer supported or used.

Change-Id: I6f2900f2a4643a8fce98b3b637f3dd82ef1d1043
---
D files/ganglia/plugins/kraken_webrequest_loss.py
D files/ganglia/plugins/kraken_webrequest_loss.pyconf
M manifests/misc/monitoring.pp
M manifests/role/analytics.pp
D manifests/role/analytics/kraken.pp
M manifests/role/analytics/refinery.pp
M manifests/role/deployment.pp
M manifests/site.pp
M modules/contint/manifests/packages.pp
9 files changed, 3 insertions(+), 312 deletions(-)

Approvals:
  Ottomata: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/files/ganglia/plugins/kraken_webrequest_loss.py 
b/files/ganglia/plugins/kraken_webrequest_loss.py
deleted file mode 100644
index 8b966a8..0000000
--- a/files/ganglia/plugins/kraken_webrequest_loss.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-    Python Gmond Module for Kraken Webrequest Loss Percentage.
-    Loss percentage per source host data is generated by the packetloss
-    Oozie job in Kraken.
-
-    :copyright: (c) 2012 Wikimedia Foundation
-    :author: Andrew Otto <[email protected]>
-    :license: GPL
-
-"""
-from __future__ import print_function
-
-import logging
-import commands
-
-UPDATE_INTERVAL = 3600 # seconds
-
-# Config for multiple metrics.
-# Currently we only compute a single webrequest loss
-# percentage, but this allows us to add more later.
-metrics = {
-    'webrequest_loss_average': {
-        'description': 'Average Webrequest Loss Percentage',
-        'path':        '/wmf/data/webrequest/loss',
-    }
-}
-
-def latest_loss_path(metric_name):
-    """Returns HDFS path to the most recently generated webrequest loss 
data."""
-    logging.debug("latest_loss_path(%s)" % metrics[metric_name]['path'])
-    return commands.getoutput("/usr/bin/hadoop fs -ls %s | /usr/bin/tail -n 1 
| /usr/bin/awk '{print $NF}'" % (metrics[metric_name]['path']))
-
-def loss_data(loss_path):
-    """Returns the output data inside the HDFS loss_path."""
-    logging.debug("loss_data(%s)" % loss_path)
-    return commands.getoutput("/usr/bin/hadoop fs -cat %s/part*" % (loss_path))
-
-def loss_average(loss_data):
-    """Parses loss_data for loss percentages and averages them all."""
-    logging.debug("loss_average(%s)" % loss_data)
-    percent_sum = 0.0
-    loss_lines = loss_data.split("\n")
-    for line in loss_lines:
-        fields = line.split("\t")
-        percent = fields[-1]
-        percent_sum += float(percent)
-
-    average_percent = (percent_sum / float(len(loss_lines)))
-    return average_percent
-
-def metric_handler(name):
-    """Get value of particular metric; part of Gmond interface"""
-    logging.debug('metric_handler(): %s', name)
-    return loss_average(loss_data(latest_loss_path(name)))
-
-def metric_init(params):
-    global descriptors
-
-    descriptors = []
-    for metric_name, metric_config in metrics.items():
-        descriptors.append({
-            'name': metric_name,
-            'call_back': metric_handler,
-            'time_max': 3660,
-            'value_type': 'float',
-            'units': '%',
-            'slope': 'both',
-            'format': '%f',
-            'description': metric_config['description'],
-            'groups': 'analytics'
-        })
-
-    return descriptors
-
-
-def metric_cleanup():
-    """Teardown; part of Gmond interface"""
-    pass
-
-
-if __name__ == '__main__':
-    # When invoked as standalone script, run a self-test by querying each
-    # metric descriptor and printing it out.
-    logging.basicConfig(level=logging.DEBUG)
-    for metric in metric_init({}):
-        value = metric['call_back'](metric['name'])
-        print(( "%s => " + metric['format'] ) % ( metric['name'], value ))
diff --git a/files/ganglia/plugins/kraken_webrequest_loss.pyconf 
b/files/ganglia/plugins/kraken_webrequest_loss.pyconf
deleted file mode 100644
index 2ea2fea..0000000
--- a/files/ganglia/plugins/kraken_webrequest_loss.pyconf
+++ /dev/null
@@ -1,20 +0,0 @@
-# Gmond configuration for calculating
-# webrequest data loss stored in HDFS in Kraken.
-
-modules {
-  module {
-    name = "kraken_webrequest_loss"
-    language = "python"
-  }
-}
-
-collection_group {
-  collect_every = 3600
-  time_threshold = 3660
-
-  metric {
-    name = "webrequest_loss_average"
-    title = "Average Loss Percentage"
-    value_threshold = 0
-  }
-}
diff --git a/manifests/misc/monitoring.pp b/manifests/misc/monitoring.pp
index b4fa29b..e27147a 100644
--- a/manifests/misc/monitoring.pp
+++ b/manifests/misc/monitoring.pp
@@ -64,37 +64,6 @@
     }
 }
 
-# == Class misc::monitoring::kraken::loss
-# Checks recently generated webrequest loss statistics in
-# Kraken HDFS and sends the average loss percentage to ganglia.
-#
-class misc::monitoring::kraken::loss {
-    file {
-        '/usr/lib/ganglia/python_modules/kraken_webrequest_loss.py':
-            require => File['/usr/lib/ganglia/python_modules'],
-            source  => 
'puppet:///files/ganglia/plugins/kraken_webrequest_loss.py',
-            notify  => Service['gmond'];
-        '/etc/ganglia/conf.d/udp_stats.pyconf':
-            require => 
File['/usr/lib/ganglia/python_modules/kraken_webrequest_loss.py'],
-            source  => 
'puppet:///files/ganglia/plugins/kraken_webrequest_loss.pyconf',
-            notify  => Service['gmond'];
-    }
-
-    # Set up icinga monitoring of Kraken HDFS data loss.
-    monitor_service { 'kraken_webrequest_loss_average_positive':
-        description           => 'webrequest_loss_average_positive',
-        check_command         => 'check_kraken_webrequest_loss_positive!2!8',
-        contact_group         => 'analytics',
-    }
-    # It is possible to have negative data loss.  This would mean that
-    # we are receiving duplicates log lines.  We need alerts for this too.
-    monitor_service { 'kraken_webrequest_loss_average_negative':
-        description           => 'webrequest_loss_average_negative',
-        check_command         => 'check_kraken_webrequest_loss_negative!-2!-8',
-        contact_group         => 'analytics',
-    }
-}
-
 # Ganglia views that should be
 # avaliable on ganglia.wikimedia.org
 class misc::monitoring::views {
diff --git a/manifests/role/analytics.pp b/manifests/role/analytics.pp
index d54501c..24d7e33 100644
--- a/manifests/role/analytics.pp
+++ b/manifests/role/analytics.pp
@@ -18,9 +18,6 @@
 # == Class role::analytics::clients
 # Includes common client classes for
 # working with hadoop and other analytics services.
-# This class is often included by including
-# role::analytics::kraken, but you may include
-# it on its own if you don't need any kraken code.
 class role::analytics::clients {
     include role::analytics
 
diff --git a/manifests/role/analytics/kraken.pp 
b/manifests/role/analytics/kraken.pp
deleted file mode 100644
index e390cec..0000000
--- a/manifests/role/analytics/kraken.pp
+++ /dev/null
@@ -1,148 +0,0 @@
-# kraken.pp - role classes dealing with Kraken data analysis.
-#
-# NOTE!  'kraken' will be renamed soon.
-
-# == Class role::analytics::kraken
-# Kraken refers to the Analytics codebase used to generate
-# analytics for WMF.
-class role::analytics::kraken {
-    # Need Hadoop client classes included to use Kraken.
-    include role::analytics::clients
-
-    # We want to be able to geolocate IP addresses
-    include geoip
-    # udp-filter is a useful thing!
-    include misc::udp2log::udp_filter
-
-    # many Kraken python scripts use docopt for CLI parsing.
-    if !defined(Package['python-docopt']) {
-        package { 'python-docopt':
-            ensure => 'installed',
-        }
-    }
-
-    # Many kraken jobs use dclass for
-    # User Agent Device classification
-    package { 'libdclass-java':
-        ensure  => 'installed',
-    }
-
-    # Include kraken/deploy repository target.
-    package { 'analytics/kraken/deploy':
-        provider => 'trebuchet',
-    }
-    # kraken/deploy repository is deployed via git deploy into here.
-    # You must deploy this yourself, puppet will not do it for you.
-    $path = '/srv/deployment/analytics/kraken/deploy/kraken'
-
-    # Path in HDFS in which external data should be imported.
-    $external_data_hdfs_dir = '/wmf/data/external'
-
-    # Create directory in /var/log for general purpose Kraken job logging.
-    $log_dir = '/var/log/kraken'
-    file { $log_dir:
-        ensure => 'directory',
-        owner  => 'root',
-        group  => 'stats',
-        # setgid bit here to make kraken log files writeable
-        # by users in the stats group.
-        mode   => '2775',
-    }
-
-}
-
-# == Class role::analytics::kraken::jobs::import::kafka
-# Submits Camus MapReduce jobs to import data from Kafka.
-class role::analytics::kraken::jobs::import::kafka {
-    require role::analytics::kraken
-
-    $camus_webrequest_properties = 
"${::role::analytics::kraken::path}/kraken-etl/conf/camus.webrequest.properties"
-    $camus_webrequest_log_file   = 
"${::role::analytics::kraken::log_dir}/camus-webrequest.log"
-    cron { 'kraken-import-hourly-webrequest':
-        command => "${::role::analytics::kraken::path}/kraken-etl/camus 
--job-name camus-webrequest-import ${camus_webrequest_properties} >> 
${camus_webrequest_log_file} 2>&1",
-        user    => 'hdfs',  # we might want to use a different user for this, 
not sure.
-        minute  => '*/10',
-    }
-
-    $camus_eventlogging_properties = 
"${::role::analytics::kraken::path}/kraken-etl/conf/camus.eventlogging.properties"
-    $camus_eventlogging_log_file   = 
"${::role::analytics::kraken::log_dir}/camus-eventlogging.log"
-    cron { 'kraken-import-hourly-eventlogging':
-        command => "${::role::analytics::kraken::path}/kraken-etl/camus 
--job-name camus-eventlogging-import ${camus_eventlogging_properties} >> 
${camus_eventlogging_log_file} 2>&1",
-        user    => 'hdfs',  # we might want to use a different user for this, 
not sure.
-        minute  => '*/10',
-    }
-}
-
-# == Class role::analytics::kraken::import::pagecounts
-# Handles importing of hourly pagecount statistics into
-# HDFS and creating Hive partition tables.
-class role::analytics::kraken::jobs::import::pagecounts {
-    include role::analytics::kraken
-
-    $script      = 
"${role::analytics::kraken::path}/kraken-etl/pagecount-importer"
-    $datadir     = $role::analytics::kraken::external_data_hdfs_dir
-
-    # Don't attempt to import anything before this date.
-    # This imports everything since August 1 2013.
-    $start_date  = '2013.07.31_23'
-
-    # Note:  I'm not worried about logrotate yet.
-    # This generates just a few lines per hour.
-    $log_file     = 
"${role::analytics::kraken::log_dir}/pagecount-importer.log"
-
-    # make sure the script has been deployed.
-    exec { "${script}-exists":
-        command => "/usr/bin/test -f ${script}",
-        # This exec doesn't actually create $script, but
-        # we don't need to run test -f it puppet can already
-        # tell that the file exists.
-        creates => $script,
-    }
-
-    # cron job to download any missing pagecount files from
-    # dumps.wikimedia.org and store them into HDFS.
-    cron { 'kraken-import-hourly-pagecounts':
-        command => "${script} --start ${start_date} ${datadir} >> ${log_file} 
2>&1",
-        user    => 'hdfs',
-        minute  => 5,
-        require => Exec["${script}-exists"],
-    }
-}
-
-# == Class role::analytics::kraken::hive::partitions::external
-# Installs cron job that creates external Hive partitions for imported
-# datasets in $external_data_hdfs_dir.
-class role::analytics::kraken::jobs::hive::partitions::external {
-    include role::analytics::kraken
-
-    $script      = 
"${role::analytics::kraken::path}/kraken-etl/hive-partitioner"
-    $datadir     = $role::analytics::kraken::external_data_hdfs_dir
-    $database    = 'wmf'
-
-    # We are only using hive-partition to add partitions to the pagecounts 
table.
-    # The webrequest table is using Oozie.
-    $tables      = 'pagecounts'
-
-    # Note:  I'm not worried about logrotate yet.
-    # This generates just a few lines per hour.
-    $log_file    = "${role::analytics::kraken::log_dir}/hive-partitioner.log"
-
-    # make sure the script has been deployed.
-    exec { "${script}-exists":
-        command => "/usr/bin/test -x ${script}",
-        # This exec doesn't actually create $script, but
-        # we don't need to run test it puppet can already
-        # tell that the file exists.
-        creates => $script,
-    }
-
-
-    # cron job to automatically create hive partitions for any
-    # newly imported data.
-    cron { 'kraken-create-external-hive-partitions':
-        command => "${script} --database ${database} --tables ${tables} 
${datadir} >> ${log_file} 2>&1",
-        user    => 'hdfs',
-        minute  => 21,
-        require => Exec["${script}-exists"],
-    }
-}
diff --git a/manifests/role/analytics/refinery.pp 
b/manifests/role/analytics/refinery.pp
index 2db7b20..532efad 100644
--- a/manifests/role/analytics/refinery.pp
+++ b/manifests/role/analytics/refinery.pp
@@ -3,7 +3,7 @@
 # and using the analytics/refinery repository.
 #
 class role::analytics::refinery {
-    # Many Kraken python scripts use docopt for CLI parsing.
+    # Some refinery python scripts use docopt for CLI parsing.
     if !defined(Package['python-docopt']) {
         package { 'python-docopt':
             ensure => 'installed',
diff --git a/manifests/role/deployment.pp b/manifests/role/deployment.pp
index 8abe908..adf33df 100644
--- a/manifests/role/deployment.pp
+++ b/manifests/role/deployment.pp
@@ -77,11 +77,6 @@
         'gitfat_enabled' => true,
         'upstream'       => 
'https://gerrit.wikimedia.org/r/operations/software/elasticsearch/plugins',
     },
-    'analytics/kraken/deploy'        => {
-        'gitfat_enabled'      => true,
-        'checkout_submodules' => true,
-        'upstream'            => 
'https://gerrit.wikimedia.org/r/p/analytics/kraken/deploy',
-    },
     'analytics/refinery'        => {
         'gitfat_enabled'      => true,
         'upstream'            => 
'https://gerrit.wikimedia.org/r/analytics/refinery',
diff --git a/manifests/site.pp b/manifests/site.pp
index 77679f0..9bdebe4 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -254,10 +254,8 @@
     include role::logging::udp2log::misc
 }
 
-# analytics1027 hosts the frontend
-# interfaces to Kraken and Hadoop.
-# (Hue, Oozie, Hive, etc.).  It
-# also submits regularly scheduled
+# analytics1027 hosts some frontend web interfaces to Hadoop
+# (Hue, Oozie, Hive, etc.).  It also submits regularly scheduled
 # batch Hadoop jobs.
 node 'analytics1027.eqiad.wmnet' {
     $nagios_group = 'analytics_eqiad'
diff --git a/modules/contint/manifests/packages.pp 
b/modules/contint/manifests/packages.pp
index b510f53..eddcf91 100644
--- a/modules/contint/manifests/packages.pp
+++ b/modules/contint/manifests/packages.pp
@@ -122,17 +122,6 @@
         package { ['libcidr0-dev', 'libanon0-dev']:
             ensure => 'latest',
         }
-
-        # Used for mobile device classification in Kraken:
-        package { [
-            'libdclass0',
-            'libdclass0-dev',
-            'libdclass-jni',
-            'libdclass-java',
-            'libdclass-data',
-            ]:
-            ensure => 'installed',
-        }
     }
 
     if ubuntu_version('>= trusty') {

-- 
To view, visit https://gerrit.wikimedia.org/r/168147
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6f2900f2a4643a8fce98b3b637f3dd82ef1d1043
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Hashar <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: QChris <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to