Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/220752

Change subject: Add new projectview to projectcounts aggregation
......................................................................

Add new projectview to projectcounts aggregation

Remove projectcounts data git repo and cron jobs from aggregator.pp, keeping 
only code git repo.
Add aggregator/projectcounts.pp class, taking care of legacy projectcounts data 
git repo and cron jobs.
Add aggregator/projectview.pp class, taking care of new projectview data git 
repo and cron jobs.

Bug: T101118
Change-Id: I98bc48535720935ace4a888680320602619dcbb1
---
M modules/statistics/manifests/aggregator.pp
A modules/statistics/manifests/aggregator/projectcounts.pp
A modules/statistics/manifests/aggregator/projectview.pp
3 files changed, 155 insertions(+), 55 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/52/220752/1

diff --git a/modules/statistics/manifests/aggregator.pp 
b/modules/statistics/manifests/aggregator.pp
index a114f87..1024aaa 100644
--- a/modules/statistics/manifests/aggregator.pp
+++ b/modules/statistics/manifests/aggregator.pp
@@ -1,26 +1,12 @@
 # == Class statistics::aggregator
-# Handles aggregation of pagecounts-all-sites projectcounts files
-# TODO: Should this be in its own module?
+# Handles projectcounts aggregation code
 #
 class statistics::aggregator {
     Class['::statistics'] -> Class['::statistics::aggregator']
 
-    # This class uses the cdh::hadoop::mount in order to get
-    # data files out of HDFS.
-    Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator']
-
     $working_path     = "${::statistics::working_path}/aggregator"
 
     $script_path      = "${working_path}/scripts"
-    $data_repo_path   = "${working_path}/data"
-    $data_path        = "${data_repo_path}/projectcounts"
-    $log_path         = "${working_path}/log"
-    # This should not be hardcoded.  Instead, one should be able to use
-    # $::cdh::hadoop::mount::mount_point to reference the user supplied
-    # parameter when the cdh::hadoop::mount class is evaluated.
-    # I am not sure why this is not working.
-    $hdfs_mount_point = '/mnt/hdfs'
-    $hdfs_source_path = 
"${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites"
     $user             = $::statistics::user::username
     $group            = $::statistics::user::username
 
@@ -41,44 +27,4 @@
         require   => File[$working_path],
     }
 
-    git::clone { 'aggregator_data':
-        ensure    => 'latest',
-        directory => $data_repo_path,
-        origin    => 
'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git',
-        owner     => $user,
-        group     => $group,
-        mode      => '0755',
-        require   => File[$working_path],
-    }
-
-    file { $log_path:
-        ensure  => 'directory',
-        owner   => $user,
-        group   => $group,
-        mode    => '0755',
-        require => File[$working_path],
-
-    }
-
-    # Cron for doing the basic aggregation step itself
-    cron { 'aggregator projectcounts aggregate':
-        command => "${script_path}/bin/aggregate_projectcounts --source 
${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' 
+\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` 
--push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log",
-        user    => $user,
-        hour    => '13',
-        minute  => '0',
-        require => [
-            Git::Clone['aggregator_code'],
-            Git::Clone['aggregator_data'],
-            File[$log_path],
-        ],
-    }
-
-    # Cron for basing monitoring of the aggregated data
-    cron { 'aggregator projectcounts monitor':
-        command => "${script_path}/bin/check_validity_aggregated_projectcounts 
--data ${data_path}",
-        user    => $user,
-        hour    => '13',
-        minute  => '45',
-        require => Cron['aggregator projectcounts aggregate'],
-    }
 }
diff --git a/modules/statistics/manifests/aggregator/projectcounts.pp 
b/modules/statistics/manifests/aggregator/projectcounts.pp
new file mode 100644
index 0000000..079c21d
--- /dev/null
+++ b/modules/statistics/manifests/aggregator/projectcounts.pp
@@ -0,0 +1,78 @@
+# == Class statistics::aggregator::projectcounts
+# Handles aggregation of pagecounts-all-sites projectcounts files
+#
+# WARNING - Files aggregated by this instance are legacy ones
+# A new pageview definition has been provided and aggregation
+# for it can be found in the same folder: projectview.pp
+#
+class statistics::aggregator::projectcounts {
+    Class['::statistics::aggregator'] -> 
Class['::statistics::aggregator::projectcounts']
+
+    # This class uses the cdh::hadoop::mount in order to get
+    # data files out of HDFS.
+    Class['cdh::hadoop::mount'] -> 
Class['::statistics::aggregator::projectcounts']
+
+    $script_path      = $::statistics::aggregator::script_path
+    $working_path     = 
"${::statistics::aggregator::working_path}/projectcounts"
+    $data_repo_path   = "${working_path}/data"
+    $data_path        = "${data_repo_path}/projectcounts"
+    $log_path         = "${working_path}/log"
+    # This should not be hardcoded.  Instead, one should be able to use
+    # $::cdh::hadoop::mount::mount_point to reference the user supplied
+    # parameter when the cdh::hadoop::mount class is evaluated.
+    # I am not sure why this is not working.
+    $hdfs_mount_point = '/mnt/hdfs'
+    $hdfs_source_path = 
"${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites"
+    $user             = $::statistics::user::username
+    $group            = $::statistics::user::username
+
+    file { $working_path:
+        ensure => 'directory',
+        owner  => $user,
+        group  => $group,
+        mode   => '0755'
+    }
+
+    git::clone { 'aggregator_data':
+        ensure    => 'latest',
+        directory => $data_repo_path,
+        origin    => 
'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git',
+        owner     => $user,
+        group     => $group,
+        mode      => '0755',
+        require   => File[$working_path],
+    }
+
+    file { $log_path:
+        ensure  => 'directory',
+        owner   => $user,
+        group   => $group,
+        mode    => '0755',
+        require => File[$working_path],
+
+    }
+
+    # Cron for doing the basic aggregation step itself
+    cron { 'aggregator projectcounts aggregate':
+        command => "${script_path}/bin/aggregate_projectcounts --source 
${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' 
+\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` 
--push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log",
+        user    => $user,
+        hour    => '13',
+        minute  => '0',
+        require => [
+            # Dependency from aggregator.pp
+            Git::Clone['aggregator_code'],
+
+            Git::Clone['aggregator_data'],
+            File[$log_path],
+        ],
+    }
+
+    # Cron for basic monitoring of the aggregated data
+    cron { 'aggregator projectcounts monitor':
+        command => "${script_path}/bin/check_validity_aggregated_projectcounts 
--data ${data_path}",
+        user    => $user,
+        hour    => '13',
+        minute  => '45',
+        require => Cron['aggregator projectcounts aggregate'],
+    }
+}
diff --git a/modules/statistics/manifests/aggregator/projectview.pp 
b/modules/statistics/manifests/aggregator/projectview.pp
new file mode 100644
index 0000000..b4596d2
--- /dev/null
+++ b/modules/statistics/manifests/aggregator/projectview.pp
@@ -0,0 +1,76 @@
+# == Class statistics::aggregator::projectview
+# Handles aggregation of projectview_hourly files
+#
+# WARNING - Files aggregated by this instance are using the
+# new pageview definition. The legacy ones are managed by
+# projectcounts.pp in the same folder.
+#
+class statistics::aggregator::projectview {
+    Class['::statistics::aggregator'] -> 
Class['::statistics::aggregator::projectview']
+
+    # This class uses the cdh::hadoop::mount in order to get
+    # data files out of HDFS.
+    Class['cdh::hadoop::mount'] -> 
Class['::statistics::aggregator::projectview']
+
+    $script_path      = $::statistics::aggregator::script_path
+    $working_path     = "${::statistics::aggregator::working_path}/projectview"
+    $data_repo_path   = "${working_path}/data"
+    $data_path        = "${data_repo_path}/projectview"
+    $log_path         = "${working_path}/log"
+    # This should not be hardcoded.  Instead, one should be able to use
+    # $::cdh::hadoop::mount::mount_point to reference the user supplied
+    # parameter when the cdh::hadoop::mount class is evaluated.
+    # I am not sure why this is not working.
+    $hdfs_mount_point = '/mnt/hdfs'
+    $hdfs_source_path = 
"${hdfs_mount_point}/wmf/data/archive/projectview/webstatcollector/hourly"
+    $user             = $::statistics::user::username
+    $group            = $::statistics::user::username
+
+    file { $working_path:
+        ensure => 'directory',
+        owner  => $user,
+        group  => $group,
+        mode   => '0755'
+    }
+
+    git::clone { 'aggregator_projectview_data':
+        ensure    => 'latest',
+        directory => $data_repo_path,
+        origin    => 
'https://gerrit.wikimedia.org/r/p/analytics/aggregator/projectview/data.git',
+        owner     => $user,
+        group     => $group,
+        mode      => '0755',
+        require   => File[$working_path],
+    }
+
+    file { $log_path:
+        ensure  => 'directory',
+        owner   => $user,
+        group   => $group,
+        mode    => '0755',
+        require => File[$working_path],
+
+    }
+
+    # Cron for doing the basic aggregation step itself
+    cron { 'aggregator projectview aggregate':
+        command => "${script_path}/bin/aggregate_projectcounts --source 
${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' 
+\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` 
--push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log",
+        user    => $user,
+        hour    => '13',
+        minute  => '0',
+        require => [
+            Git::Clone['aggregator_code'],
+            Git::Clone['aggregator_projectview_data'],
+            File[$log_path],
+        ],
+    }
+
+    # Cron for basic monitoring of the aggregated data
+    cron { 'aggregator projectview monitor':
+        command => "${script_path}/bin/check_validity_aggregated_projectcounts 
--data ${data_path}",
+        user    => $user,
+        hour    => '13',
+        minute  => '45',
+        require => Cron['aggregator projectview aggregate'],
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/220752
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I98bc48535720935ace4a888680320602619dcbb1
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Joal <j...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to