ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/401051 )
Change subject: pep8 for rsyncmedia script ...................................................................... pep8 for rsyncmedia script Change-Id: I85ca822d84ef2466218d6ba830e01178922ad931 --- M xmldumps-backup/unused/rsyncmedia.py 1 file changed, 174 insertions(+), 153 deletions(-) Approvals: ArielGlenn: Looks good to me, approved jenkins-bot: Verified diff --git a/xmldumps-backup/unused/rsyncmedia.py b/xmldumps-backup/unused/rsyncmedia.py index 36c608a..b95fe9d 100644 --- a/xmldumps-backup/unused/rsyncmedia.py +++ b/xmldumps-backup/unused/rsyncmedia.py @@ -1,50 +1,56 @@ -import os, sys, getopt, subprocess +import os +import sys +import getopt from subprocess import Popen, PIPE + +def make_path(dir_list): + dirs = filter(None, dir_list) + if len(dirs) == 0: + return None + elif len(dirs) == 1: + # this is ok even with 'None' + return dirs[0] + else: + return os.path.join(*dirs) + + class Rsyncer(object): - def __init__(self, rsyncHost, remoteBaseDir, outputDir, verbose, dryrun): - self.rsyncHost = rsyncHost - self.remoteBaseDir = remoteBaseDir - self.outputDir = outputDir + def __init__(self, rsync_host, remote_base_dir, output_dir, verbose, dryrun): + self.rsync_host = rsync_host + self.remote_base_dir = remote_base_dir + self.output_dir = output_dir self.verbose = verbose self.dryrun = dryrun - self.dirList = [] + self.dir_list = [] - def makePath(self, dirList): - dirs = filter(None, dirList) - if len(dirs) == 0: - return None - elif len(dirs) == 1: - # this is ok even with 'None' - return dirs[0] - else: - return os.path.join(*dirs) + def do_rsync(self, files_to_do, get_dir_list=False): - def doRsync(self, filesToDo, localPath, getDirList = False): - localdir = self.makePath([ self.outputDir, localPath ]) - - command = [ "rsync", "-rltDp" ] - if getDirList: - if filesToDo: - filesToDoList = filesToDo.split('\n') - if len(filesToDoList) > 1: - sys.stderr.write("refusing to generate wanted dir list for multiple toplevel dirs %s\n" % filesToDo) + command = ["rsync", "-rltDp"] + if get_dir_list: + if files_to_do: + files_to_do_list = files_to_do.split('\n') + if len(files_to_do_list) > 1: + sys.stderr.write("refusing to generate wanted " + "dir list for multiple toplevel dirs %s\n" + % files_to_do) return - # we want the first level of hash dirs (to see what exists, so we can request only those) + # we want the first level of hash dirs (to see what + # exists, so we can request only those) # but we don't want anything below that. - excludeLevels = 3 + filesToDoList[0].count('/') - excludeString = "/*" * excludeLevels - command.extend([ "-f", "- "+ excludeString ]) - command.extend([ "--list-only" ]) - dryrunSaved = self.dryrun + exclude_levels = 3 + files_to_do_list[0].count('/') + exclude_string = "/*" * exclude_levels + command.extend(["-f", "- " + exclude_string]) + command.extend(["--list-only"]) + dryrun_saved = self.dryrun self.dryrun = False # we don't actually change anything with --list-only so run it - if filesToDo: - command.extend([ "--files-from", "-" ]) - if rsyncHost: - command.extend([self.rsyncHost + "::" + self.remoteBaseDir, self.outputDir ]) + if files_to_do: + command.extend(["--files-from", "-"]) + if self.rsync_host: + command.extend([self.rsync_host + "::" + self.remote_base_dir, self.output_dir]) else: # "remote" dir is accessible as a local filesystem - command.extend([ self.remoteBaseDir, self.outputDir ]) + command.extend([self.remote_base_dir, self.output_dir]) # 23 = Partial transfer due to error # 24 = Partial transfer due to vanished source files @@ -54,35 +60,37 @@ # some of each type of error on every single run, log things # but don't bail - if (getDirList): - result, output = self.dirList = self.doCommand(command, filesToDo, [23, 24], displayOutput = False) + if get_dir_list: + result_unused, output = self.dir_list = self.do_command( + command, files_to_do, [23, 24], display_output=False) else: - result, output = self.doCommand(command, filesToDo, [23, 24]) - if getDirList: - self.dryrun = dryrunSaved + result_unused, output = self.do_command(command, files_to_do, [23, 24]) + if get_dir_list: + self.dryrun = dryrun_saved return output - def doCommand(self, command, inputToCommand, returnCodesAllowed, displayOutput = True): + def do_command(self, command, input_to_command, return_codes_allowed, display_output=True): output = None - commandString = " ".join(command) + command_string = " ".join(command) if self.dryrun: sys.stderr.write("would run commmand: ") elif self.verbose: sys.stderr.write("about to run command: ") if self.dryrun or self.verbose: - sys.stderr.write(commandString) - if inputToCommand: - sys.stderr.write("\nwith input: %s" % inputToCommand) + sys.stderr.write(command_string) + if input_to_command: + sys.stderr.write("\nwith input: %s" % input_to_command) sys.stderr.write("\n") if self.dryrun: return 0, output try: error = None - proc = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE) - output, error = proc.communicate(inputToCommand) - if proc.returncode and proc.returncode not in returnCodesAllowed: - sys.stderr.write("command '%s failed with return code %s and error %s\n" % ( command, proc.returncode, error )) + proc = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) + output, error = proc.communicate(input_to_command) + if proc.returncode and proc.returncode not in return_codes_allowed: + sys.stderr.write("command '%s failed with return code %s " + "and error %s\n" % (command, proc.returncode, error)) # we don't bail here, let the caller decide what to do about it" except: sys.stderr.write("command %s failed\n" % command) @@ -90,7 +98,7 @@ sys.stderr.write("%s\n" % error) # the problem is probably serious enough that we should refuse to do further processing raise - if output and displayOutput: + if output and display_output: print output if error: if error: @@ -105,154 +113,163 @@ self.wtype = wtype self.wikidir = wikidir - def doRsync(self): + def do_rsync(self): if self.wtype == "huge": # do all 256 shards separately - self.doHugeRsync() + self.do_huge_rsync() elif self.wtype == "big": # do the top 16 shards separately - self.doBigRsync() + self.do_big_rsync() else: # do the whole thing at once - self.doNormalRsync() + self.do_normal_rsync() - def getFilesFrom(self, hashdir = None, subdir = None): + def get_files_from(self, hashdir=None, subdir=None): """get list of directories for rsync that will be fed to the "--files-from -" option""" - return self.rsyncer.makePath([ self.wikidir, hashdir, subdir ]) + return make_path([self.wikidir, hashdir, subdir]) - def getLocalPath(self, hashdir = None, subdir = None): - """get the local output path for..""" - return self.rsyncer.makePath([ self.wikidir, hashdir, subdir ]) - - def doHugeRsync(self): + def do_huge_rsync(self): if self.rsyncer.verbose or self.rsyncer.dryrun: sys.stderr.write("doing 256 separate shards for wiki %s\n" % self.wiki) - dirs = ["0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f"] - subdirs = ["0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f"] - for d in dirs: - for s in subdirs: - filesFrom = self.getFilesFrom(d, d+s) - localPath = self.getLocalPath(d, d+s) - self.rsyncer.doRsync(filesFrom, localPath) + dirs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", + "9", "a", "b", "c", "d", "e", "f"] + subdirs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", + "9", "a", "b", "c", "d", "e", "f"] + for dname in dirs: + for subdname in subdirs: + files_from = self.get_files_from(dname, dname+subdname) + self.rsyncer.do_rsync(files_from) # now get the archive dir - for d in dirs: - filesFrom = self.getFilesFrom("archive", d) - localPath = self.getLocalPath("archive", d) - self.rsyncer.doRsync(filesFrom, localPath) + for dname in dirs: + files_from = self.get_files_from("archive", dname) + self.rsyncer.do_rsync(files_from) - def doBigRsync(self): + def do_big_rsync(self): if self.rsyncer.verbose or self.rsyncer.dryrun: sys.stderr.write("doing 16 separate shards for wiki %s\n" % self.wiki) - dirs = [ "0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","archive"] - for d in dirs: - filesFrom = self.getFilesFrom(d) - localPath = self.getLocalPath(d) - self.rsyncer.doRsync(filesFrom, localPath) + dirs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", + "9", "a", "b", "c", "d", "e", "f", "archive"] + for dname in dirs: + files_from = self.get_files_from(dname) + self.rsyncer.do_rsync(files_from) - def doNormalRsync(self): - # for anything not big or huge, get list of media dirs that the wiki has, this will be the list of dirs we want + def do_normal_rsync(self): + # for anything not big or huge, get list of media dirs that + # the wiki has, this will be the list of dirs we want if self.rsyncer.verbose or self.rsyncer.dryrun: sys.stderr.write("retrieving dir list for wiki %s\n" % self.wiki) - localPath = self.getLocalPath() - dirsFound = self.rsyncer.doRsync(self.wikidir, localPath, getDirList = True) + dirs_found = self.rsyncer.do_rsync( + self.wikidir, get_dir_list=True) # explicitly list the 17 dirs we want - dirsWanted = [ self.rsyncer.makePath([self.wikidir, d]) for d in ["0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","archive"] ] - # filter out the ones not in dirList, keeps rsync from whining about nonexistent dirs + dirs_wanted = [make_path([self.wikidir, d]) + for d in ["0", "1", "2", "3", "4", "5", "6", "7", "8", + "9", "a", "b", "c", "d", "e", "f", "archive"]] + # filter out the ones not in dirList, keeps rsync from + # whining about nonexistent dirs # format of the returned lines is # drwxrwxr-x 4096 2012/04/06 10:45:34 blahblah/8 - filesFrom = [ f.rsplit(None,1)[1] for f in dirsFound.split('\n') if '/' in f ] - filesFrom = "\n".join([ f for f in filesFrom if f in dirsWanted ]) - if filesFrom: + files_from = [f.rsplit(None, 1)[1] for f in dirs_found.split('\n') if '/' in f] + files_from = "\n".join([f for f in files_from if f in dirs_wanted]) + if files_from: if self.rsyncer.verbose or self.rsyncer.dryrun: sys.stderr.write("doing 1 shard for wiki %s\n" % self.wiki) - self.rsyncer.doRsync(filesFrom, localPath) + self.rsyncer.do_rsync(files_from) else: if self.rsyncer.verbose or self.rsyncer.dryrun: sys.stderr.write("skipping wiki %s, no dirs to sync\n" % self.wiki) -def usage(message = None): +def usage(message=None): if message: sys.stderr.write("%s\n" % message) - sys.stderr.write("Usage: python rsyncmedia.py [--remotehost hostname] --remotedir dirname\n") - sys.stderr.write(" --localdir dirname --wikilist filename\n") - sys.stderr.write(" [--big wiki1,wiki2,...] [--huge wiki3,wiki4,...]\n") - sys.stderr.write(" [--verbose] [--dryrun]\n") - sys.stderr.write("\n") - sys.stderr.write("This script rsyncs media from a primary media host. getting only media\n") - sys.stderr.write("publically available (no deleted images, no data from private wikis)\n") - sys.stderr.write("and skipping thumbs, math, timeline, temp, old and misc other directories\n") - sys.stderr.write("that may have been created over time.\n") - sys.stderr.write("\n") - sys.stderr.write("--remotehost: hostname of the remote host form which we are rsyncing.\n") - sys.stderr.write(" if this option is ommited, the remotedir option is assumed\n") - sys.stderr.write(" to refer to a local filesystem (for example nfs-mounted)\n") - sys.stderr.write("--remotedir: path to point in remote directory in which media for the\n") - sys.stderr.write(" wiki(s) are stored; this path is relative to the rsync root.\n") - sys.stderr.write("--localdir: path to root of local directory tree in which media for\n") - sys.stderr.write(" the wiki(s) will be copied.\n") - sys.stderr.write("--wikilist filename which contains names of the wiki databases and their\n") - sys.stderr.write(" corresponding media upload directories, one wiki per line,\n") - sys.stderr.write(" line, to be rsynced. The wikiname and the directory should be\n") - sys.stderr.write(" separated by a tab character. If '-' is given as the name\n") - sys.stderr.write(" wiki db names and directories will be read from stdin.\n") - sys.stderr.write("--big comma-separated list of wiki db names which have enough media\n") - sys.stderr.write(" that we should rsync them in 16 batches, one per subdir\n") - sys.stderr.write(" instead of all at once.\n") - sys.stderr.write("--huge comma-separated list of wiki db names which have enough media\n") - sys.stderr.write(" that we should rsync them in 256 batches, one per 2nd level\n") - sys.stderr.write(" subdir instead of all at once.\n") - sys.stderr.write("--verbose: print lots of status messages.\n") - sys.stderr.write("--dryrun: don't do the rsync, print what would be done.\n") - sys.stderr.write("wiki name of wikidb for rsync; if specified, this will override\n") - sys.stderr.write(" any file given for 'wikilist'.\n") - sys.exit(1) + usage_message = """ +Usage: python rsyncmedia.py [--remotehost hostname] --remotedir dirname + --localdir dirname --wikilist filename + [--big wiki1,wiki2,...] [--huge wiki3,wiki4,...] + [--verbose] [--dryrun] -def getCommaSepList(text): +This script rsyncs media from a primary media host. getting only media +publically available (no deleted images, no data from private wikis) +and skipping thumbs, math, timeline, temp, old and misc other directories +that may have been created over time. + +--remotehost: hostname of the remote host form which we are rsyncing. + if this option is ommited, the remotedir option is assumed + to refer to a local filesystem (for example nfs-mounted) +--remotedir: path to point in remote directory in which media for the + wiki(s) are stored; this path is relative to the rsync root. +--localdir: path to root of local directory tree in which media for + the wiki(s) will be copied. +--wikilist filename which contains names of the wiki databases and their + corresponding media upload directories, one wiki per line, + line, to be rsynced. The wikiname and the directory should be + separated by a tab character. If '-' is given as the name + wiki db names and directories will be read from stdin. +--big comma-separated list of wiki db names which have enough media + that we should rsync them in 16 batches, one per subdir + instead of all at once. +--huge comma-separated list of wiki db names which have enough media + that we should rsync them in 256 batches, one per 2nd level + subdir instead of all at once. +--verbose: print lots of status messages. +--dryrun: don't do the rsync, print what would be done. + +wiki name of wikidb for rsync; if specified, this will override + any file given for 'wikilist'. +""" + sys.stderr.write(usage_message) + sys.exit(1) + + +def get_comma_sep_list(text): if text: if ',' in text: result = text.split(',') else: - result = [ text ] + result = [text] else: result = [] return result -if __name__ == "__main__": - remoteDir = None - rsyncHost = None - localDir = None + +def do_main(): + remote_dir = None + rsync_host = None + local_dir = None big = None huge = None - wikiListFile = None + wiki_list_file = None verbose = False dryrun = False try: - (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", [ "big=", "huge=", "localdir=", "remotedir=", "remotehost=", "localdir=", "wikilist=", "verbose", "dryrun" ]) - except: + (options, remainder) = getopt.gnu_getopt( + sys.argv[1:], "", ["big=", "huge=", "localdir=", "remotedir=", + "remotehost=", "localdir=", "wikilist=", + "verbose", "dryrun"]) + except Exception: usage("Unknown option specified") for (opt, val) in options: if opt == "--remotedir": - remoteDir = val + remote_dir = val elif opt == "--remotehost": - rsyncHost = val + rsync_host = val elif opt == "--localdir": - localDir = val + local_dir = val elif opt == "--big": big = val elif opt == "--huge": huge = val elif opt == "--wikilist": - wikiListFile = val + wiki_list_file = val elif opt == "--verbose": verbose = True elif opt == "--dryrun": @@ -261,42 +278,46 @@ if len(remainder) > 0: usage("Unknown option specified") - if not remoteDir or not localDir or not wikiListFile: + if not remote_dir or not local_dir or not wiki_list_file: usage("One or more mandatory options missing") - if wikiListFile == "-": - fd = sys.stdin + if wiki_list_file == "-": + fdesc = sys.stdin else: - fd = open(wikiListFile ,"r") - wikiList = [ line.strip() for line in fd ] + fdesc = open(wiki_list_file, "r") + wiki_list = [line.strip() for line in fdesc] - if fd != sys.stdin: - fd.close() + if fdesc != sys.stdin: + fdesc.close() # eg enwiki - bigWikis = getCommaSepList(big) + big_wikis = get_comma_sep_list(big) # eg commonswiki - hugeWikis = getCommaSepList(huge) + huge_wikis = get_comma_sep_list(huge) - rsyncer = Rsyncer(rsyncHost, remoteDir, localDir, verbose, dryrun) + rsyncer = Rsyncer(rsync_host, remote_dir, local_dir, verbose, dryrun) - for winfo in wikiList: + for winfo in wiki_list: # first skip blank lines and comments if not winfo or winfo[0] == '#': continue - if not '\t' in winfo: - sys.stderr.write("unexpected line with no tab in wikilist: %s\n") % winfo + if '\t' not in winfo: + sys.stderr.write("unexpected line with no tab in wikilist: %s\n" % winfo) continue # expect <wikiname>\t<directory> - w, wikidir = winfo.split('\t', 1) + wikiname, wikidir = winfo.split('\t', 1) - if w in hugeWikis: + if wikiname in huge_wikis: wtype = "huge" - elif w in bigWikis: + elif wikiname in big_wikis: wtype = "big" else: wtype = "normal" - rp = RsyncProject(rsyncer, w, wtype, wikidir) - rp.doRsync() + rsync_proj = RsyncProject(rsyncer, wikiname, wtype, wikidir) + rsync_proj.do_rsync() + + +if __name__ == "__main__": + do_main() -- To view, visit https://gerrit.wikimedia.org/r/401051 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I85ca822d84ef2466218d6ba830e01178922ad931 Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits