Smalyshev has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/373354 )

Change subject: [WIP] Add RDF dumps for categories
......................................................................

[WIP] Add RDF dumps for categories

Creates an RDF dump for each wiki configured in the categories-rdf dblist
in other/categoriesrdf/YYYYMMDD.
Keeps old dumps for 70 days.
The dumps are kept in TTL format.

Bug:
Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
---
A modules/snapshot/files/cron/dumpcategoriesrdf.sh
A modules/snapshot/files/cron/logrotate.categoriesrdf
M modules/snapshot/manifests/cron.pp
A modules/snapshot/manifests/cron/categoriesrdf.pp
4 files changed, 173 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/54/373354/1

diff --git a/modules/snapshot/files/cron/dumpcategoriesrdf.sh 
b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
new file mode 100755
index 0000000..b128a62
--- /dev/null
+++ b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+#############################################################
+# This file is maintained by puppet!
+# modules/snapshot/cron/dumpcategoriesrdf.sh
+#############################################################
+#
+# Generate an RDF dump of categories for all wikis in 
+# categories-rdf list and remove old ones.
+
+source /usr/local/etc/set_dump_dirs.sh
+
+usage() {
+       echo "Usage: $0 [--config <pathtofile>] [--dryrun]"
+       echo
+       echo "  --config  path to configuration file for dump generation"
+       echo "            (default value: ${confsdir}/wikidump.conf"
+       echo "  --dryrun  don't run dump, show what would have been done"
+       exit 1
+}
+
+# Defaults; --config and --dryrun below override the first two.
+configFile="${confsdir}/wikidump.conf"
+dryrun="false"
+dumpFormat="ttl"
+dbList="categories-rdf"
+
+# Parse command-line options. Anything unrecognised prints the usage
+# text and exits. "$1" is quoted so that empty arguments or arguments
+# containing spaces are reported as unknown options instead of making
+# the test itself fail.
+while [ $# -gt 0 ]; do
+       case "$1" in
+               --config)
+                       # --config needs a value; without this check the
+                       # path would silently become empty.
+                       if [ $# -lt 2 ]; then
+                               echo "$0: --config requires an argument"
+                               usage
+                       fi
+                       configFile="$2"
+                       shift 2
+                       ;;
+               --dryrun)
+                       dryrun="true"
+                       shift
+                       ;;
+               *)
+                       echo "$0: Unknown option $1"
+                       usage
+                       ;;
+       esac
+done
+
+# The config file must exist before any value can be read from it.
+if [ ! -f "$configFile" ]; then
+       echo "Could not find config file: $configFile"
+       echo "Exiting..."
+       exit 1
+fi
+
+# Pull the "key=value" settings this script needs out of the config file:
+# each egrep selects the line starting with "key=", and mawk prints what
+# follows that prefix.
+deployDir=$(egrep "^dir=" "$configFile" | mawk -Fdir= '{ print $2 }')
+gzip=$(egrep "^gzip=" "$configFile" | mawk -Fgzip= '{ print $2 }')
+privateList=$(egrep "^privatelist=" "$configFile" | mawk -Fprivatelist= '{ print $2 }')
+publicDir=$(egrep "^public=" "$configFile" | mawk -Fpublic= '{ print $2 }')
+
+# All four settings are mandatory; report what was found and stop if any
+# of them is empty. Separate tests joined with || replace the obsolescent
+# "-o" operator of [.
+if [ -z "$deployDir" ] || [ -z "$gzip" ] || [ -z "$privateList" ] || [ -z "$publicDir" ]; then
+       echo "failed to find value of one of the following from config file $configFile:"
+       echo "gzip: $gzip"
+       echo "dir: $deployDir"
+       echo "privatelist: $privateList"
+       echo "public: $publicDir"
+       echo "exiting..."
+       exit 1
+fi
+
+today=$(date +'%Y%m%d')
+targetDirBase="$publicDir/other/categoriesrdf"
+targetDir="$targetDirBase/$today"
+timestampsDir="$targetDirBase/lastdump"
+multiVersionScript="$deployDir/multiversion/MWScript.php"
+
+# remove old datasets
+daysToKeep=70
+cutOff=$(( $(date +%s) - $(( $daysToKeep + 1 )) * 24 * 3600))
+if [ -d "$targetDirBase" ]; then
+       for folder in $(ls -d -r $targetDirBase/*); do
+               creationTime=$(date --utc --date="$(basename $folder)" +%s 
2>/dev/null)
+               if [ -n "$creationTime" ]; then
+                   if [ "$cutOff" -gt "$creationTime" ]; then
+                       if [ "$dryrun" == "true" ]; then
+                               echo rm $folder/*.$dumpFormat.gz
+                               echo rmdir $folder
+                       else
+                               rm -f $folder/*.$dumpFormat.gz
+                               rmdir $folder
+                       fi
+                   fi
+               fi
+       done
+fi
+
+# create todays folder
+if [ "$dryrun" == "true" ]; then
+       echo mkdir -p "$targetDir"
+       echo mkdir -p "$timestampsDir"
+else
+       if ! mkdir -p "$targetDir"; then
+               echo "Can't make output directory: $targetDir"
+               echo "Exiting..."
+               exit 1
+       fi
+       if ! mkdir -p "$timestampsDir"; then
+               echo "Can't make output directory: $timestampsDir"
+               echo "Exiting..."
+               exit 1
+       fi
+fi
+
+# The private-wiki filter below only works if the list can be read; a
+# failing grep would otherwise make every wiki look public.
+if [ ! -r "$privateList" ]; then
+       echo "Could not read private wiki list: $privateList"
+       echo "Exiting..."
+       exit 1
+fi
+
+# iterate over configured wikis
+# Each line printed by expanddblist is taken as one wiki name.
+/usr/local/bin/expanddblist "$dbList" | while read -r wiki; do
+       # exclude all private wikis (exact, whole-line match in the list)
+       if grep -qxF -- "$wiki" "$privateList"; then
+               continue
+       fi
+       filename="$wiki-$today-categories"
+       targetFile="$targetDir/$filename.$dumpFormat.gz"
+       tsFile="$timestampsDir/$wiki-categories.last"
+       logFile="/var/log/categoriesrdf/$filename.log"
+       if [ "$dryrun" == "true" ]; then
+               # Only show what a real run would execute; nothing is written.
+               echo "php $multiVersionScript maintenance/dumpCategoriesAsRdf.php --wiki=$wiki --format=$dumpFormat 2> $logFile | $gzip > $targetFile"
+               echo "echo $today > $tsFile"
+       else
+               # Dump to stdout, compress into the target file, and send the
+               # dump script's stderr to a per-wiki log.
+               # $gzip is left unquoted as in the original so a configured
+               # value containing options would still be split into words.
+               php "$multiVersionScript" maintenance/dumpCategoriesAsRdf.php --wiki="$wiki" --format="$dumpFormat" 2> "$logFile" | $gzip > "$targetFile"
+               # Record the date of this wiki's latest dump.
+               # NOTE(review): written regardless of the php exit status --
+               # confirm whether consumers expect it only after success.
+               echo "$today" > "$tsFile"
+       fi
+done
+
+
+# Maintain a 'latest' symlink always pointing at the most recently completed dump
+if [ "$dryrun" == "false" ]; then
+       # Name the link by its full path instead of cd-ing first: after a
+       # failed cd the link would be created in whatever directory the
+       # script was started from. The link target stays relative ("$today"),
+       # so it resolves next to the link inside $targetDirBase.
+       ln -snf "$today" "$targetDirBase/latest"
+fi
diff --git a/modules/snapshot/files/cron/logrotate.categoriesrdf 
b/modules/snapshot/files/cron/logrotate.categoriesrdf
new file mode 100644
index 0000000..af0cac5
--- /dev/null
+++ b/modules/snapshot/files/cron/logrotate.categoriesrdf
@@ -0,0 +1,11 @@
+# This file is managed by puppet
+# puppet:///modules/snapshot/cron/logrotate.categoriesrdf
+#
+# Rotation policy for every *.log file in /var/log/categoriesrdf.
+/var/log/categoriesrdf/*.log {
+    # rotate the matched logs every day
+    daily
+    # compress rotated logs ...
+    compress
+    # ... but leave the most recently rotated one uncompressed until the
+    # next rotation
+    delaycompress
+    # do not report an error when no log file matches
+    missingok
+    # remove rotated logs older than 22 days
+    maxage 22
+    # do not create a fresh empty log after rotating
+    nocreate
+}
diff --git a/modules/snapshot/manifests/cron.pp 
b/modules/snapshot/manifests/cron.pp
index 9e8441e..c5a25a6 100644
--- a/modules/snapshot/manifests/cron.pp
+++ b/modules/snapshot/manifests/cron.pp
@@ -4,6 +4,7 @@
     class { '::snapshot::cron::mediaperprojectlists': user => $user }
     class { '::snapshot::cron::pagetitles': user   => $user }
     class { '::snapshot::cron::cirrussearch': user   => $user }
+    class { '::snapshot::cron::categoriesrdf': user   => $user }
     class { '::snapshot::cron::dumplists': user   => $user }
     class { '::snapshot::cron::dump_global_blocks': user   => $user }
     class { '::snapshot::cron::wikidatadumps::json': user   => $user }
diff --git a/modules/snapshot/manifests/cron/categoriesrdf.pp 
b/modules/snapshot/manifests/cron/categoriesrdf.pp
new file mode 100644
index 0000000..87cbdd6
--- /dev/null
+++ b/modules/snapshot/manifests/cron/categoriesrdf.pp
@@ -0,0 +1,36 @@
+# Sets up the weekly categories RDF dump: log directory, logrotate
+# configuration, the dump script itself and the cron job that runs it.
+#
+# Parameters:
+#   $user - owner of the log directory and the account the cron job
+#           runs as
+class snapshot::cron::categoriesrdf(
+    $user   = undef,
+) {
+    # Directory holding the dump config file passed to the script below.
+    $confsdir = $snapshot::dumps::dirs::confsdir
+
+    # Log directory, owned by the dump user.
+    file { '/var/log/categoriesrdf':
+        ensure => 'directory',
+        mode   => '0644',
+        owner  => $user,
+    }
+
+    # Rotation policy for the logs in the directory above.
+    logrotate::conf { 'categoriesrdf':
+        ensure => present,
+        source => 'puppet:///modules/snapshot/cron/logrotate.categoriesrdf',
+    }
+
+    # Install the dump script, root-owned and executable by everyone.
+    $scriptpath = '/usr/local/bin/dumpcategoriesrdf.sh'
+    file { $scriptpath:
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+        source => 'puppet:///modules/snapshot/cron/dumpcategoriesrdf.sh',
+    }
+
+    # Run the dump at 20:00 on weekday 6 as $user; the job is only
+    # managed once the script file resource has been applied.
+    # NOTE(review): the MAILTO address contains "..." and looks truncated
+    # by the mail archive -- confirm the real address before applying.
+    cron { 'categoriesrdf-dump':
+        ensure      => 'present',
+        command     => "${scriptpath} --config ${confsdir}/wikidump.conf",
+        environment => 'MAILTO=ops-du...@wikimedia.org',
+        user        => $user,
+        minute      => '0',
+        hour        => '20',
+        weekday     => '6',
+        require     => File[$scriptpath],
+    }
+}
+

-- 
To view, visit https://gerrit.wikimedia.org/r/373354
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to