Ottomata has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/364718 )
Change subject: Create reportupdater::jobs profiles for stat boxes
......................................................................
Create reportupdater::jobs profiles for stat boxes
This makes it a little easier to conditionally include reportupdater in
different roles, and helps with the new stat box migration.
Bug: T152712
Change-Id: I51f5b85f7c378f250e477dcca625cc3e9b3ae99b
---
A modules/profile/manifests/reportupdater/README
A modules/profile/manifests/reportupdater/jobs/hadoop.pp
A modules/profile/manifests/reportupdater/jobs/mysql.pp
M modules/role/manifests/statistics/cruncher.pp
M modules/role/manifests/statistics/private.pp
M modules/statistics/manifests/init.pp
6 files changed, 126 insertions(+), 117 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/18/364718/1
diff --git a/modules/profile/manifests/reportupdater/README
b/modules/profile/manifests/reportupdater/README
new file mode 100644
index 0000000..b68d758
--- /dev/null
+++ b/modules/profile/manifests/reportupdater/README
@@ -0,0 +1,4 @@
+ReportUpdater is an Analytics-owned set of scripts and conventions that make it easier
+for users to generate regular reports, usually by running SQL queries and
+outputting publicly accessible TSVs.
+
diff --git a/modules/profile/manifests/reportupdater/jobs/hadoop.pp
b/modules/profile/manifests/reportupdater/jobs/hadoop.pp
new file mode 100644
index 0000000..990cc14
--- /dev/null
+++ b/modules/profile/manifests/reportupdater/jobs/hadoop.pp
@@ -0,0 +1,43 @@
+# == Class profile::reportupdater::jobs::hadoop
+# Installs reportupdater jobs that run on Hadoop/Hive.
+# This profile should only be included in a single role.
+#
+# This requires that a Hadoop client is installed and the statistics compute role
+# for the published_datasets_path.
+class profile::reportupdater::jobs::hadoop {
+    # Order Hadoop before this profile. TODO: it would be better to depend on
+    # role::analytics_cluster::client, but that seems wrong from a profile.
+    # Perhaps when analytics_cluster roles have been refactored to profiles.
+    Class['cdh::hadoop'] -> Class['profile::reportupdater::jobs::hadoop']
+
+    require ::statistics::compute
+
+    # Set up reportupdater.
+    # Reportupdater here launches Hadoop jobs, and
+    # the 'hdfs' user is the only 'system' user that has
+    # access to required files in Hadoop.
+    class { 'reportupdater':
+        user => 'hdfs',
+    }
+
+    # Link periodic job output into published_datasets_path so that it is
+    # synced to analytics.wikimedia.org/datasets/periodic/reports.
+    # NOTE(review): confirm the link target matches reportupdater's base_path.
+    file { "${::statistics::compute::published_datasets_path}/periodic":
+        ensure => 'directory',
+        owner  => 'root',
+        group  => 'wikidev',
+        mode   => '0775',
+    }
+    file { "${::statistics::compute::published_datasets_path}/periodic/reports":
+        ensure  => 'link',
+        target  => "${::statistics::working_path}/reportupdater/output",
+        require => Class['reportupdater'],
+    }
+
+    # Set up a job to create browser reports on hive db.
+    reportupdater::job { 'browser':
+        repository => 'reportupdater-queries',
+        output_dir => 'metrics/browser',
+    }
+}
diff --git a/modules/profile/manifests/reportupdater/jobs/mysql.pp
b/modules/profile/manifests/reportupdater/jobs/mysql.pp
new file mode 100644
index 0000000..6ba2de2
--- /dev/null
+++ b/modules/profile/manifests/reportupdater/jobs/mysql.pp
@@ -0,0 +1,74 @@
+# == Class profile::reportupdater::jobs::mysql
+#
+# Installs reportupdater package, and sets up jobs that run reports and generate output from
+# MySQL analytics slaves. This profile should only be included in a single role.
+#
+# This requires the statistics module for the stats user and the published_datasets_path.
+#
+class profile::reportupdater::jobs::mysql {
+    require ::statistics
+    require ::statistics::compute
+
+    # Set up reportupdater to be executed on this machine
+    class { 'reportupdater':
+        user => $::statistics::user::username,
+    }
+
+    # And set up a link for periodic jobs to be included in published reports.
+    # Because periodic is in published_datasets_path, files will be synced to
+    # analytics.wikimedia.org/datasets/periodic/reports
+    file { "${::statistics::compute::published_datasets_path}/periodic":
+        ensure => 'directory',
+        owner  => 'root',
+        group  => 'wikidev',
+        mode   => '0775',
+    }
+    file { "${::statistics::compute::published_datasets_path}/periodic/reports":
+        ensure  => 'link',
+        target  => '/srv/reportupdater/output',
+        require => Class['reportupdater'],
+    }
+
+    # Set up various jobs to be executed by reportupdater
+    # creating several reports on mysql research db.
+    reportupdater::job { 'flow':
+        repository => 'limn-flow-data',
+        output_dir => 'flow/datafiles',
+    }
+    reportupdater::job { 'flow-beta-features':
+        repository => 'limn-flow-data',
+        output_dir => 'metrics/beta-feature-enables',
+    }
+    reportupdater::job { 'edit':
+        repository => 'limn-edit-data',
+        output_dir => 'metrics',
+    }
+    reportupdater::job { 'edit-beta-features':
+        repository => 'limn-edit-data',
+        output_dir => 'metrics/beta-feature-enables',
+    }
+    reportupdater::job { 'language':
+        repository => 'limn-language-data',
+        output_dir => 'metrics/beta-feature-enables',
+    }
+    reportupdater::job { 'ee':
+        repository => 'limn-ee-data',
+        output_dir => 'metrics/echo',
+    }
+    reportupdater::job { 'ee-beta-features':
+        repository => 'limn-ee-data',
+        output_dir => 'metrics/beta-feature-enables',
+    }
+    reportupdater::job { 'multimedia':
+        repository => 'limn-multimedia-data',
+        output_dir => 'metrics/multimedia-health',
+    }
+    reportupdater::job { 'ee-migration':
+        repository => 'limn-ee-data',
+        output_dir => 'metrics/ee',
+    }
+    reportupdater::job { 'interactive':
+        repository => 'discovery-stats',
+        output_dir => 'metrics/interactive',
+    }
+}
\ No newline at end of file
diff --git a/modules/role/manifests/statistics/cruncher.pp
b/modules/role/manifests/statistics/cruncher.pp
index efba497..b8a3834 100644
--- a/modules/role/manifests/statistics/cruncher.pp
+++ b/modules/role/manifests/statistics/cruncher.pp
@@ -1,4 +1,4 @@
-# (stat1003)
+# (stat1003 / stat1006)
class role::statistics::cruncher inherits role::statistics::base {
system::role { 'statistics::cruncher':
description => 'Statistics general compute node (non private data)'
@@ -19,6 +19,7 @@
# rsync logs from logging hosts
include ::statistics::rsync::eventlogging
+ include ::profile::reportupdater::jobs::mysql
# geowiki: bringing data from production slave db to research db
include geowiki::job::data
@@ -27,68 +28,4 @@
# geowiki: monitors the geowiki files of http://gp.wmflabs.org/
include geowiki::job::monitoring
-
- # Set up reportupdater to be executed on this machine
- class { 'reportupdater':
- base_path => "${::statistics::working_path}/reportupdater",
- user => $::statistics::user::username,
- }
-
- # And set up a link for periodic jobs to be included in published reports.
- # Because periodic is in published_datasets_path, files will be synced to
- # analytics.wikimedia.org/datasets/periodic/reports
- file { "${::statistics::compute::published_datasets_path}/periodic":
- ensure => 'directory',
- owner => 'root',
- group => 'wikidev',
- mode => '0775',
- }
- file {
"${::statistics::compute::published_datasets_path}/periodic/reports":
- ensure => 'link',
- target => "${::statistics::working_path}/reportupdater/output",
- require => Class['reportupdater'],
- }
-
- # Set up various jobs to be executed by reportupdater
- # creating several reports on mysql research db.
- reportupdater::job { 'flow':
- repository => 'limn-flow-data',
- output_dir => 'flow/datafiles',
- }
- reportupdater::job { 'flow-beta-features':
- repository => 'limn-flow-data',
- output_dir => 'metrics/beta-feature-enables',
- }
- reportupdater::job { 'edit':
- repository => 'limn-edit-data',
- output_dir => 'metrics',
- }
- reportupdater::job { 'edit-beta-features':
- repository => 'limn-edit-data',
- output_dir => 'metrics/beta-feature-enables',
- }
- reportupdater::job { 'language':
- repository => 'limn-language-data',
- output_dir => 'metrics/beta-feature-enables',
- }
- reportupdater::job { 'ee':
- repository => 'limn-ee-data',
- output_dir => 'metrics/echo',
- }
- reportupdater::job { 'ee-beta-features':
- repository => 'limn-ee-data',
- output_dir => 'metrics/beta-feature-enables',
- }
- reportupdater::job { 'multimedia':
- repository => 'limn-multimedia-data',
- output_dir => 'metrics/multimedia-health',
- }
- reportupdater::job { 'ee-migration':
- repository => 'limn-ee-data',
- output_dir => 'metrics/ee',
- }
- reportupdater::job { 'interactive':
- repository => 'discovery-stats',
- output_dir => 'metrics/interactive',
- }
}
diff --git a/modules/role/manifests/statistics/private.pp
b/modules/role/manifests/statistics/private.pp
index b75e097..fd8da86 100644
--- a/modules/role/manifests/statistics/private.pp
+++ b/modules/role/manifests/statistics/private.pp
@@ -29,50 +29,16 @@
# private. We just keep it here to spare adding a separate role.
include ::statistics::aggregator::projectview
- include passwords::mysql::research
# This file will render at
# /etc/mysql/conf.d/statistics-private-client.cnf.
# This is so that users in the statistics-privatedata-users
# group who want to access the research slave dbs do not
# have to be in the research group, which is not included
# in the private role.
- mysql::config::client { 'statistics-private':
- user => $::passwords::mysql::research::user,
- pass => $::passwords::mysql::research::pass,
+ statistics::mysql_credentials { 'statistics-private':
group => 'statistics-privatedata-users',
- mode => '0440',
}
- # Set up reportupdater to be executed on this machine.
- # Reportupdater launches Hadoop jobs, and
- # the 'hdfs' user is the only 'system' user that has
- # access to required files in Hadoop.
- class { 'reportupdater':
- base_path => "${::statistics::working_path}/reportupdater",
- user => 'hdfs',
- # We know that this is included, but unfortunetly
- # it is done so outside of this role. Perhaps
- # reportupdater should have its own role!
- require => Class['cdh::hadoop'],
- }
- # And set up a link for periodic jobs to be included in published reports.
- # Because periodic is in published_datasets_path, files will be synced to
- # analytics.wikimedia.org/datasets/periodic/reports
- file { "${::statistics::compute::published_datasets_path}/periodic":
- ensure => 'directory',
- owner => 'root',
- group => 'wikidev',
- mode => '0775',
- }
- file {
"${::statistics::compute::published_datasets_path}/periodic/reports":
- ensure => 'link',
- target => "${::statistics::working_path}/reportupdater/output",
- require => Class['reportupdater'],
- }
-
- # Set up a job to create browser reports on hive db.
- reportupdater::job { 'browser':
- repository => 'reportupdater-queries',
- output_dir => 'metrics/browser',
- }
+ # Run Hadoop/Hive reportupdater jobs here.
+ include ::profile::reportupdater::jobs::hadoop
}
diff --git a/modules/statistics/manifests/init.pp
b/modules/statistics/manifests/init.pp
index 616a49d..0844efe 100644
--- a/modules/statistics/manifests/init.pp
+++ b/modules/statistics/manifests/init.pp
@@ -14,21 +14,6 @@
) {
include ::statistics::user
- file { $working_path:
- ensure => 'directory',
- owner => 'root',
- group => 'wikidev',
- mode => '0775',
- }
-
- if $working_path == '/srv' {
- # symlink /a to /srv for backwards compatibility
- file { '/a':
- ensure => 'link',
- target => '/srv',
- }
- }
-
# set up rsync modules for copying files
# on statistic servers in $working_path
class { '::statistics::rsyncd':
--
To view, visit https://gerrit.wikimedia.org/r/364718
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I51f5b85f7c378f250e477dcca625cc3e9b3ae99b
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits