Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/220752
Change subject: Add new projectview to projectcounts aggregation ...................................................................... Add new projectview to projectcounts aggregation Remove projectcounts data git repo and cron jobs from aggregator.pp, keeping only code git repo. Add aggregator/projectcounts.pp class, taking care of legacy projectcounts data git repo and cron jobs. Add aggregator/projectview.pp class, taking care of new projectview data git repo and cron jobs. Bug: T101118 Change-Id: I98bc48535720935ace4a888680320602619dcbb1 --- M modules/statistics/manifests/aggregator.pp A modules/statistics/manifests/aggregator/projectcounts.pp A modules/statistics/manifests/aggregator/projectview.pp 3 files changed, 155 insertions(+), 55 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/52/220752/1 diff --git a/modules/statistics/manifests/aggregator.pp b/modules/statistics/manifests/aggregator.pp index a114f87..1024aaa 100644 --- a/modules/statistics/manifests/aggregator.pp +++ b/modules/statistics/manifests/aggregator.pp @@ -1,26 +1,12 @@ # == Class statistics::aggregator -# Handles aggregation of pagecounts-all-sites projectcounts files -# TODO: Should this be in its own module? +# Handles projectcounts aggregation code # class statistics::aggregator { Class['::statistics'] -> Class['::statistics::aggregator'] - # This class uses the cdh::hadoop::mount in order to get - # data files out of HDFS. - Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator'] - $working_path = "${::statistics::working_path}/aggregator" $script_path = "${working_path}/scripts" - $data_repo_path = "${working_path}/data" - $data_path = "${data_repo_path}/projectcounts" - $log_path = "${working_path}/log" - # This should not be hardcoded. Instead, one should be able to use - # $::cdh::hadoop::mount::mount_point to reference the user supplied - # parameter when the cdh::hadoop::mount class is evaluated. - # I am not sure why this is not working. - $hdfs_mount_point = '/mnt/hdfs' - $hdfs_source_path = "${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites" $user = $::statistics::user::username $group = $::statistics::user::username @@ -41,44 +27,4 @@ require => File[$working_path], } - git::clone { 'aggregator_data': - ensure => 'latest', - directory => $data_repo_path, - origin => 'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git', - owner => $user, - group => $group, - mode => '0755', - require => File[$working_path], - } - - file { $log_path: - ensure => 'directory', - owner => $user, - group => $group, - mode => '0755', - require => File[$working_path], - - } - - # Cron for doing the basic aggregation step itself - cron { 'aggregator projectcounts aggregate': - command => "${script_path}/bin/aggregate_projectcounts --source ${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' +\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` --push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log", - user => $user, - hour => '13', - minute => '0', - require => [ - Git::Clone['aggregator_code'], - Git::Clone['aggregator_data'], - File[$log_path], - ], - } - - # Cron for basing monitoring of the aggregated data - cron { 'aggregator projectcounts monitor': - command => "${script_path}/bin/check_validity_aggregated_projectcounts --data ${data_path}", - user => $user, - hour => '13', - minute => '45', - require => Cron['aggregator projectcounts aggregate'], - } } diff --git a/modules/statistics/manifests/aggregator/projectcounts.pp b/modules/statistics/manifests/aggregator/projectcounts.pp new file mode 100644 index 0000000..079c21d --- /dev/null +++ b/modules/statistics/manifests/aggregator/projectcounts.pp @@ -0,0 +1,78 @@ +# == Class statistics::aggregator::projectcounts +# Handles aggregation of pagecounts-all-sites projectcounts files +# +# WARNING - Files aggregated by this instance are legacy ones +# A new pageview definition has been provided and aggregation +# for it can be found in the same folder: projectview.pp +# +class statistics::aggregator::projectcounts { + Class['::statistics::aggregator'] -> Class['::statistics::aggregator::projectcounts'] + + # This class uses the cdh::hadoop::mount in order to get + # data files out of HDFS. + Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator::projectcounts'] + + $script_path = $::statistics::aggregator::script_path + $working_path = "${::statistics::aggregator::working_path}/projectcounts" + $data_repo_path = "${working_path}/data" + $data_path = "${data_repo_path}/projectcounts" + $log_path = "${working_path}/log" + # This should not be hardcoded. Instead, one should be able to use + # $::cdh::hadoop::mount::mount_point to reference the user supplied + # parameter when the cdh::hadoop::mount class is evaluated. + # I am not sure why this is not working. + $hdfs_mount_point = '/mnt/hdfs' + $hdfs_source_path = "${hdfs_mount_point}/wmf/data/archive/pagecounts-all-sites" + $user = $::statistics::user::username + $group = $::statistics::user::username + + file { $working_path: + ensure => 'directory', + owner => $user, + group => $group, + mode => '0755' + } + + git::clone { 'aggregator_data': + ensure => 'latest', + directory => $data_repo_path, + origin => 'https://gerrit.wikimedia.org/r/p/analytics/aggregator/data.git', + owner => $user, + group => $group, + mode => '0755', + require => File[$working_path], + } + + file { $log_path: + ensure => 'directory', + owner => $user, + group => $group, + mode => '0755', + require => File[$working_path], + + } + + # Cron for doing the basic aggregation step itself + cron { 'aggregator projectcounts aggregate': + command => "${script_path}/bin/aggregate_projectcounts --source ${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' +\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` --push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log", + user => $user, + hour => '13', + minute => '0', + require => [ + # Dependency from aggregator.pp + Git::Clone['aggregator_code'], + + Git::Clone['aggregator_data'], + File[$log_path], + ], + } + + # Cron for basing monitoring of the aggregated data + cron { 'aggregator projectcounts monitor': + command => "${script_path}/bin/check_validity_aggregated_projectcounts --data ${data_path}", + user => $user, + hour => '13', + minute => '45', + require => Cron['aggregator projectcounts aggregate'], + } +} diff --git a/modules/statistics/manifests/aggregator/projectview.pp b/modules/statistics/manifests/aggregator/projectview.pp new file mode 100644 index 0000000..b4596d2 --- /dev/null +++ b/modules/statistics/manifests/aggregator/projectview.pp @@ -0,0 +1,76 @@ +# == Class statistics::aggregator::projectview +# Handles aggregation of projectview_hourly files +# +# WARNING - Files aggregated by this instance are using the +# new pageview definition. The legacy ones are managed by +# projectcounts.pp in the same folder. +# +class statistics::aggregator::projectview { + Class['::statistics::aggregator'] -> Class['::statistics::aggregator::projectview'] + + # This class uses the cdh::hadoop::mount in order to get + # data files out of HDFS. + Class['cdh::hadoop::mount'] -> Class['::statistics::aggregator::projectview'] + + $script_path = $::statistics::aggregator::script_path + $working_path = "${::statistics::aggregator::working_path}/projectview" + $data_repo_path = "${working_path}/data" + $data_path = "${data_repo_path}/projectview" + $log_path = "${working_path}/log" + # This should not be hardcoded. Instead, one should be able to use + # $::cdh::hadoop::mount::mount_point to reference the user supplied + # parameter when the cdh::hadoop::mount class is evaluated. + # I am not sure why this is not working. + $hdfs_mount_point = '/mnt/hdfs' + $hdfs_source_path = "${hdfs_mount_point}/wmf/data/archive/projectview/webstatcollector/hourly" + $user = $::statistics::user::username + $group = $::statistics::user::username + + file { $working_path: + ensure => 'directory', + owner => $user, + group => $group, + mode => '0755' + } + + git::clone { 'aggregator_projectview_data': + ensure => 'latest', + directory => $data_repo_path, + origin => 'https://gerrit.wikimedia.org/r/p/analytics/aggregator/projectview/data.git', + owner => $user, + group => $group, + mode => '0755', + require => File[$working_path], + } + + file { $log_path: + ensure => 'directory', + owner => $user, + group => $group, + mode => '0755', + require => File[$working_path], + + } + + # Cron for doing the basic aggregation step itself + cron { 'aggregator projectview aggregate': + command => "${script_path}/bin/aggregate_projectcounts --source ${hdfs_source_path} --target ${data_path} --first-date=`date --date='-8 day' +\\%Y-\\%m-\\%d` --last-date=`date --date='-1 day' +\\%Y-\\%m-\\%d` --push-target --log ${log_path}/`date +\\%Y-\\%m-\\%d--\\%H-\\%M-\\%S`.log", + user => $user, + hour => '13', + minute => '0', + require => [ + Git::Clone['aggregator_code'], + Git::Clone['aggregator_projectview_data'], + File[$log_path], + ], + } + + # Cron for basing monitoring of the aggregated data + cron { 'aggregator projectview monitor': + command => "${script_path}/bin/check_validity_aggregated_projectcounts --data ${data_path}", + user => $user, + hour => '13', + minute => '45', + require => Cron['aggregator projectview aggregate'], + } +} -- To view, visit https://gerrit.wikimedia.org/r/220752 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I98bc48535720935ace4a888680320602619dcbb1 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Joal <j...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits