ArielGlenn has submitted this change and it was merged.

Change subject: clean up dumps web server rsync to its fallback

clean up dumps web server rsync to its fallback

No need for a python script any more, just rsync the relevant directories.

Move the destination host name out to a profile.

Bug: T179942
Change-Id: I48493292def0c0630d960aaa403d2f87ff59137b
D modules/dumps/files/copying/
A modules/dumps/files/copying/
M modules/dumps/manifests/copying/peers.pp
M modules/profile/manifests/dumps/web/xmldumps_active.pp
4 files changed, 71 insertions(+), 225 deletions(-)

  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/modules/dumps/files/copying/ 
deleted file mode 100644
index 00aba76..0000000
--- a/modules/dumps/files/copying/
+++ /dev/null
@@ -1,218 +0,0 @@
-import sys
-import subprocess
-import socket
-# This file is managed by puppet!
-# puppet:///modules/dumps/copying/
-class Rsyncer(object):
-    def __init__(self, max_bw, dryrun, list_only):
-        self.max_bw = str(max_bw)
-        self.dryrun = dryrun
-        self.list_only = list_only
- = socket.gethostname()
-        self.rsync_args = ["--bwlimit=" + self.max_bw, '-a', '--delete']
-        if self.list_only:
-            self.rsync_args.append("--list-only")
-        else:
-            self.rsync_args.append("-q")
-        self.excludes = ['--exclude=wikidump_*', '--exclude=md5temp.*', 
-    def get_excludes_for_job(self, jobname, host_info):
-        excludes = []
-        for job in host_info:
-            # 'exclude': { 'dir': 'other', 'job': 'public' }
-            if (job != jobname and 'exclude' in host_info[job] and
-                    host_info[job]['exclude']['job'] == jobname):
-                excludes.append(host_info[job]['exclude']['dir'])
-        return excludes
-    def rsync_all(self, host_info):
-        for job in host_info:
-            excludes = self.get_excludes_for_job(job, host_info)
-            hosts = host_info[job]['hosts']
-            if not in hosts:
-                # no rsync job info for this host
-                continue
-            targets = [h for h in hosts if h !=]
-            if not len(targets):
-                # no hosts to rsync to
-                continue
-            if 'primary' in hosts[]:
-                # this host rsyncs everything except a specific list of dirs
-                dir_args = ["--exclude=/" + d.strip('/') + "/"
-                            for d in excludes]
-                for targ in targets:
-                    if 'dirs' in hosts[targ]:
-                        dir_args.extend(["--exclude=/" + d.strip('/') + "/"
-                                         for d in hosts[targ]['dirs']])
-            elif 'dirs' in hosts[]:
-                # this host keeps data in a specific list of dirs and must
-                # rsync those everywhere else
-                dirs_to_include = [d.strip('/')
-                                   for d in hosts[]['dirs']]
-                if not len(dirs_to_include):
-                    # no specific dirs to sync
-                    continue
-                dir_args = ["--include=/" + d + "/" for d in dirs_to_include]
-                dir_args.extend(["--include=/" + d + "/**"
-                                 for d in dirs_to_include])
-                dir_args.append('--exclude=*')
-            else:
-                # not a primary, no specific dirs to sync, do nothing
-                continue
-            self.do_rsync(host_info[job]['source'], host_info[job]['dest'],
-                          targets, dir_args)
-    def do_rsync(self, src, dest, targets, dir_args):
-        for targ in targets:
-            command = ["/usr/bin/pgrep", "-u", "root",
-                       "-f", "%s::%s" % (targ, dest)]
-            try:
-                subprocess.check_output(command)
-                # return code 0 = already running
-                if self.dryrun:
-                    print "would skip rsync to", "%s::%s" % (targ, dest)
-                continue
-            except subprocess.CalledProcessError as err:
-                if err.returncode != 1:
-                    # genuine error
-                    raise
-            command = (["/usr/bin/rsync"] + self.rsync_args + self.excludes +
-                       dir_args + [src, "%s::%s" % (targ, dest)])
-            if self.dryrun:
-                print "would run", " ".join(command)
-            else:
-                output = None
-                try:
-                    output = subprocess.check_output(command)
-                except subprocess.CalledProcessError:
-                    # fixme might want to do something with error output
-                    pass
-                if output:
-                    if self.list_only:
-                        print output
-                    else:
-                        command = ["/usr/bin/mail", '-E', '-s',
-                                   "DUMPS RSYNC " +,
-                                   'ops-dumps' + '@' + 'wikimedia' + '.org']
-                        proc = subprocess.Popen(command, stdin=subprocess.PIPE)
-                        (out_unused, errs) = proc.communicate(input=output)
-                        if errs:
-                            # give up and hope something else sees this
-                            print errs
-def usage(message):
-    if message:
-        sys.stderr.write(message + "\n")
-        help_message = """Usage: [dryrun] [bw=number] [list]
-    dryrun -- show commands that would be run instead of runnning them
-    bw     -- cap rsync bandwidth to this number (default: 40000)
-    list   -- only list files that would be transferred instead of sending them
-        sys.stderr.write(help_message)
-        sys.exit(1)
-def do_main():
-    dryrun = False
-    list_only = False
-    max_bandwidth = 40000
-    for i in range(1, len(sys.argv)):
-        if sys.argv[i] == 'dryrun':
-            dryrun = True
-        elif sys.argv[i].startswith('bw='):
-            max_bandwidth = sys.argv[i][3:]
-            if not max_bandwidth.isdigit():
-                usage("maxbw must be a number")
-        elif sys.argv[i] == 'list':
-            list_only = True
-        else:
-            usage("unknown option: " + sys.argv[i])
-    rsync = Rsyncer(max_bandwidth, dryrun, list_only)
-    # The rsync commands we would expect to see on...
-    #
-    # Primary for '/public/':
-    #   /usr/bin/rsync -v --bwlimit=40000 -a --delete
-    #          --exclude=wikidump_* --exclude=md5temp.*
-    #          --exclude=/dir-done-by-secondary/
-    #          --exclude=/another-dir-done-by-secondary/
-    #          --exclude=/other/
-    #          /data/xmldatadumps/public/
-    #          remotehost::data/xmldatadumps/public/
-    #
-    # Secondary for '/public/':
-    #   /usr/bin/rsync -v --bwlimit=40000 -a --delete
-    #          --exclude=wikidump_* --exclude=md5temp.*
-    #          --include=/dir-done-by-secondary/
-    #          --include=/another-dir-done-by-secondary/
-    #          --include=/dir-done-by-secndary/**
-    #          --include=/another-dir-done-by-secondary/**
-    #          --exclude=*
-    #          /data/xmldatadumps/public/
-    #          remotehost::data/xmldatadumps/public/
-    #
-    # primary for '/public/other/':
-    #   /usr/bin/rsync -v --bwlimit=40000 -a --delete
-    #          --exclude=wikidump_* --exclude=md5temp.*
-    #          --exclude=/subdir-done-by-secondary/
-    #          --exclude=/another-subdir-done-by-secondary/
-    #          /data/xmldatadumps/public/other/
-    #          remotehost::data/xmldatadumps/public/other/
-    #
-    # secondary for '/public/other/':
-    #   /usr/bin/rsync -v --bwlimit=40000 -a --delete
-    #          --exclude=wikidump_* --exclude=md5temp.*
-    #          --include=/subdir-done-by-secondary/
-    #          --include=/another-dir-done-by-secondary/
-    #          --include=/subdir-done-by-secondary/**
-    #          --include=/another-subdir-done-by-secondary/**
-    #          --exclude=*
-    #          /data/xmldatadumps/public/other/
-    #          remotehost::data/xmldatadumps/public/other/
-    host_info = {
-        'public': {  # job name
-            # source is an absolute path
-            'source': '/data/xmldatadumps/public/',
-            # dest will be prefixed by 'servername::' in rsync
-            'dest': 'data/xmldatadumps/public/',
-            'hosts': {
-                # everything but a specific list of dirs will be pushed:
-                'dataset1001': {'primary': True},
-                # only the specified list of dirs is here:
-                'ms1001': {'dirs': []}
-            }
-        },
-        'other': {   # job name
-            # don't sync this when doing the 'public' job:
-            'exclude': {'dir': 'other', 'job': 'public'},
-            'source': '/data/xmldatadumps/public/other/',
-            'dest': 'data/xmldatadumps/public/other/',
-            'hosts': {
-                'ms1001': {'dirs': []},
-                'dataset1001': {'primary': True}
-            }
-        }
-    }
-    rsync.rsync_all(host_info)
-if __name__ == '__main__':
-    do_main()
diff --git a/modules/dumps/files/copying/ 
new file mode 100644
index 0000000..e312bd8
--- /dev/null
+++ b/modules/dumps/files/copying/
@@ -0,0 +1,60 @@
+# This file is managed by puppet!
+# puppet:///modules/dumps/copying/
+# This script rsyncs xml/sql dumps to fallback web servers or other servers
+# that host a full copy of dumps and datasets.
+# It expects to be run as root, since it preserves owners and permissions.
+# It will not run if there is already an rsync of some sort running to/from
+# the destination host as the root user, no point in competing for
+# bandwidth. Also no point in running a second copy if this script itself
+# is already running.
+usage() {
+    cat<<EOF
+Usage: $0 --desthost <hostname>
+  --desthost   fqdn of host to which to rsync
+Example: $0 --desthost
+    exit 1
+while [ $# -gt 0 ]; do
+    if [ $1 == "--desthost" ]; then
+        desthost="$2"
+        shift; shift
+    else
+        echo "$0: Unknown option $1" >& 2
+        usage
+    fi
+if [ -z "$desthost" ]; then
+    echo "$0: missing argument --desthost"
+    usage && exit 1
+excludes="--exclude='**bad/' --exclude='**save/' --exclude='**not/' 
--exclude='**temp/' --exclude='**tmp/'--exclude='*.inprog'"
+args="--contimeout=600 --timeout=600 --bwlimit=40000 --delete"
+# /usr/bin/pgrep -f -x /usr/bin/rsync -rlptq $bwlimit 
${sourcehost}::${srcpath} ${destroot}/${destpath}
+isrunning=$( /usr/bin/pgrep -u root -f "/usr/bin/rsync .* ${desthost}::" )
+if [ -n "$isrunning" ]; then
+    exit 0
+# sample command:
+# /usr/bin/rsync --bwlimit=40000 -aq --delete --exclude='**bad/' 
--exclude='**save/' --exclude='**not/' \
+#       --exclude='**temp/' --exclude='**tmp/' --exclude='*.inprog' \
+#       /data/xmldatadumps/public/other/
+/usr/bin/rsync $args -aq $excludes --exclude=/other/ 
/data/xmldatadumps/public/  ${desthost}::data/xmldatadumps/public/
+/usr/bin/rsync $args -aq $excludes /data/xmldatadumps/public/other/ 
diff --git a/modules/dumps/manifests/copying/peers.pp 
index cf9f80d..351e183 100644
--- a/modules/dumps/manifests/copying/peers.pp
+++ b/modules/dumps/manifests/copying/peers.pp
@@ -1,20 +1,22 @@
-class dumps::copying::peers {
-    file { '/usr/local/bin/':
+class dumps::copying::peers(
+    $desthost = undef,
+) {
+    file { '/usr/local/bin/':
         ensure => 'present',
         mode   => '0755',
         owner  => 'root',
         group  => 'root',
-        path   => '/usr/local/bin/',
-        source => 'puppet:///modules/dumps/copying/',
+        path   => '/usr/local/bin/',
+        source => 'puppet:///modules/dumps/copying/'
     cron { 'rsync-dumps':
         ensure  => 'present',
         # filter out error messages about vanishing files, we don't want email for that
-        command => '/usr/bin/python /usr/local/bin/ 2>&1 | grep 
-v "vanished" ',
+        command => '/bin/bash /usr/local/bin/ 
--desthost $desthost 2>&1 | grep -v "vanished" ',
         user    => 'root',
         minute  => '0',
         hour    => '*/2',
-        require => File['/usr/local/bin/'],
+        require => File['/usr/local/bin/'],
diff --git a/modules/profile/manifests/dumps/web/xmldumps_active.pp 
index 2fd241d..bffa0a8 100644
--- a/modules/profile/manifests/dumps/web/xmldumps_active.pp
+++ b/modules/profile/manifests/dumps/web/xmldumps_active.pp
@@ -16,7 +16,9 @@
         webgroup         => 'datasets',
     # copy dumps and other datasets to fallback host(s) and to labs
-    class {'::dumps::copying::peers':}
+    class {'::dumps::copying::peers':
+        desthost => '',
+    }
     class {'::dumps::copying::labs':
         labhost   => 'labstore1003.eqiad.wmnet',
         publicdir => $publicdir,

To view, visit
To unsubscribe, visit

Gerrit-MessageType: merged
Gerrit-Change-Id: I48493292def0c0630d960aaa403d2f87ff59137b
Gerrit-PatchSet: 7
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <>
Gerrit-Reviewer: ArielGlenn <>
Gerrit-Reviewer: Volans <>
Gerrit-Reviewer: jenkins-bot <>

MediaWiki-commits mailing list

Reply via email to