Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/373354 )
Change subject: [WIP] Add RDF dumps for categories
......................................................................

[WIP] Add RDF dumps for categories

Creates RDF dump for each wiki configured in categories-rdf dblist
in other/categoriesrdf/YYYYMMDD. Keeps old dumps for 70 days.
The dumps are kept in TTL format.

Bug:
Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
---
A modules/snapshot/files/cron/dumpcategoriesrdf.sh
A modules/snapshot/files/cron/logrotate.categoriesrdf
M modules/snapshot/manifests/cron.pp
A modules/snapshot/manifests/cron/categoriesrdf.pp
4 files changed, 180 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/54/373354/1

diff --git a/modules/snapshot/files/cron/dumpcategoriesrdf.sh b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
new file mode 100755
index 0000000..b128a62
--- /dev/null
+++ b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+#############################################################
+# This file is maintained by puppet!
+# modules/snapshot/cron/dumpcategoriesrdf.sh
+#############################################################
+#
+# Generate an RDF dump of categories for all wikis in
+# categories-rdf list and remove old ones.
+
+source /usr/local/etc/set_dump_dirs.sh
+
+# Print command line help and exit with a failure status.
+usage() {
+    echo "Usage: $0 [--config <pathtofile>] [--dryrun]"
+    echo
+    echo "  --config  path to configuration file for dump generation"
+    echo "            (default value: ${confsdir}/wikidump.conf)"
+    echo "  --dryrun  don't run dump, show what would have been done"
+    exit 1
+}
+
+configFile="${confsdir}/wikidump.conf"
+dryrun="false"
+dumpFormat="ttl"
+dbList="categories-rdf"
+
+# Options are quoted so that an empty argument cannot break the test.
+while [ $# -gt 0 ]; do
+    if [ "$1" == "--config" ]; then
+        configFile="$2"
+        shift; shift;
+    elif [ "$1" == "--dryrun" ]; then
+        dryrun="true"
+        shift
+    else
+        echo "$0: Unknown option $1"
+        usage
+    fi
+done
+
+if [ ! -f "$configFile" ]; then
+    echo "Could not find config file: $configFile"
+    echo "Exiting..."
+    exit 1
+fi
+
+deployDir=$(egrep "^dir=" "$configFile" | mawk -Fdir= '{ print $2 }')
+gzip=$(egrep "^gzip=" "$configFile" | mawk -Fgzip= '{ print $2 }')
+privateList=$(egrep "^privatelist=" "$configFile" | mawk -Fprivatelist= '{ print $2 }')
+publicDir=$(egrep "^public=" "$configFile" | mawk -Fpublic= '{ print $2 }')
+
+if [ -z "$deployDir" -o -z "$gzip" -o -z "$privateList" -o -z "$publicDir" ]; then
+    echo "failed to find value of one of the following from config file $configFile:"
+    echo "gzip: $gzip"
+    echo "dir: $deployDir"
+    echo "privatelist: $privateList"
+    echo "public: $publicDir"
+    echo "exiting..."
+    exit 1
+fi
+
+today=$(date +'%Y%m%d')
+targetDirBase="$publicDir/other/categoriesrdf"
+targetDir="$targetDirBase/$today"
+timestampsDir="$targetDirBase/lastdump"
+multiVersionScript="$deployDir/multiversion/MWScript.php"
+
+# remove old datasets
+daysToKeep=70
+cutOff=$(( $(date +%s) - $(( $daysToKeep + 1 )) * 24 * 3600))
+if [ -d "$targetDirBase" ]; then
+    # Glob instead of parsing ls output; only YYYYMMDD entries are dump
+    # folders, so 'lastdump' and 'latest' are never candidates for removal.
+    for folder in "$targetDirBase"/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]; do
+        creationTime=$(date --utc --date="$(basename "$folder")" +%s 2>/dev/null)
+        if [ -n "$creationTime" ]; then
+            if [ "$cutOff" -gt "$creationTime" ]; then
+                if [ "$dryrun" == "true" ]; then
+                    echo rm $folder/*.$dumpFormat.gz
+                    echo rmdir $folder
+                else
+                    rm -f "$folder"/*."$dumpFormat".gz
+                    rmdir "$folder"
+                fi
+            fi
+        fi
+    done
+fi
+
+# create todays folder
+if [ "$dryrun" == "true" ]; then
+    echo mkdir -p "$targetDir"
+    echo mkdir -p "$timestampsDir"
+else
+    if ! mkdir -p "$targetDir"; then
+        echo "Can't make output directory: $targetDir"
+        echo "Exiting..."
+        exit 1
+    fi
+    if ! mkdir -p "$timestampsDir"; then
+        echo "Can't make output directory: $timestampsDir"
+        echo "Exiting..."
+        exit 1
+    fi
+fi
+
+# iterate over configured wikis
+/usr/local/bin/expanddblist "$dbList" | while read -r wiki; do
+    # exclude all private wikis (fixed-string, whole-line match on the name)
+    if ! grep -Fxq -- "$wiki" "$privateList"; then
+        filename="$wiki-$today-categories"
+        targetFile="$targetDir/$filename.$dumpFormat.gz"
+        tsFile="$timestampsDir/$wiki-categories.last"
+        if [ "$dryrun" == "true" ]; then
+            echo "php $multiVersionScript maintenance/dumpCategoriesAsRdf.php --wiki=$wiki --format=$dumpFormat 2> /var/log/categoriesrdf/$filename.log | $gzip > $targetFile"
+            echo "echo $today > $tsFile"
+        else
+            # the timestamp below is recorded only if the dump command itself succeeded
+            php "$multiVersionScript" maintenance/dumpCategoriesAsRdf.php --wiki="$wiki" --format="$dumpFormat" 2> "/var/log/categoriesrdf/$filename.log" | $gzip > "$targetFile"
+            if [ "${PIPESTATUS[0]}" -eq 0 ]; then
+                echo "$today" > "$tsFile"
+            fi
+        fi
+    fi
+done
+
+
+# Maintain a 'latest' symlink always pointing at the most recently completed dump
+if [ "$dryrun" == "false" ]; then
+    # no cd: a failed cd would drop the link into the current directory
+    ln -snf "$today" "$targetDirBase/latest"
+fi
diff --git a/modules/snapshot/files/cron/logrotate.categoriesrdf b/modules/snapshot/files/cron/logrotate.categoriesrdf
new file mode 100644
index 0000000..af0cac5
--- /dev/null
+++ b/modules/snapshot/files/cron/logrotate.categoriesrdf
@@ -0,0 +1,11 @@
+# This file is managed by puppet
+# puppet:///modules/snapshot/cron/logrotate.categoriesrdf
+#
+/var/log/categoriesrdf/*.log {
+    daily
+    compress
+    delaycompress
+    missingok
+    maxage 22
+    nocreate
+}
diff --git a/modules/snapshot/manifests/cron.pp b/modules/snapshot/manifests/cron.pp
index 9e8441e..c5a25a6 100644
--- a/modules/snapshot/manifests/cron.pp
+++ b/modules/snapshot/manifests/cron.pp
@@ -4,6 +4,7 @@
     class { '::snapshot::cron::mediaperprojectlists': user => $user }
     class { '::snapshot::cron::pagetitles': user => $user }
     class { '::snapshot::cron::cirrussearch': user => $user }
+    class { '::snapshot::cron::categoriesrdf': user => $user }
     class { '::snapshot::cron::dumplists': user => $user }
     class { '::snapshot::cron::dump_global_blocks': user => $user }
     class { '::snapshot::cron::wikidatadumps::json': user => $user }
diff --git a/modules/snapshot/manifests/cron/categoriesrdf.pp b/modules/snapshot/manifests/cron/categoriesrdf.pp
new file mode 100644
index 0000000..87cbdd6
--- /dev/null
+++ b/modules/snapshot/manifests/cron/categoriesrdf.pp
@@ -0,0 +1,36 @@
+class snapshot::cron::categoriesrdf(
+    $user = undef,
+) {
+    $confsdir = $snapshot::dumps::dirs::confsdir
+
+    file { '/var/log/categoriesrdf':
+        ensure => 'directory',
+        mode   => '0644',
+        owner  => $user,
+    }
+
+    logrotate::conf { 'categoriesrdf':
+        ensure => present,
+        source => 'puppet:///modules/snapshot/cron/logrotate.categoriesrdf',
+    }
+
+    $scriptpath = '/usr/local/bin/dumpcategoriesrdf.sh'
+    file { $scriptpath:
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+        source => 'puppet:///modules/snapshot/cron/dumpcategoriesrdf.sh',
+    }
+
+    cron { 'categoriesrdf-dump':
+        ensure      => 'present',
+        command     => "${scriptpath} --config ${confsdir}/wikidump.conf",
+        environment => 'MAILTO=ops-du...@wikimedia.org',
+        user        => $user,
+        minute      => '0',
+        hour        => '20',
+        weekday     => '6',
+        require     => File[$scriptpath],
+    }
+}
+

-- 
To view, visit https://gerrit.wikimedia.org/r/373354
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits