ArielGlenn has submitted this change and it was merged. Change subject: dumps: admin script to do cleanup, enter maintenance mode, etc ......................................................................
dumps: admin script to do cleanup, enter maintenance mode, etc this script will: clean up most recent broken jobs of a dump remove lockfiles from interrupted dumps on a host enter/end maintenance mode, request immediate exit after completion of current job, etc. kill with extreme prejudice any dump scripts running on a specific wiki but not the python wrapper script (yet) add a notice message to the index page for the most recent dump of a specific wiki Change-Id: I861b40210f4c018e99a9fbace544b6e68284ac7d --- A xmldumps-backup/dumpadmin.py M xmldumps-backup/dumps/runnerutils.py M xmldumps-backup/worker M xmldumps-backup/worker.py 4 files changed, 714 insertions(+), 1 deletion(-) Approvals: ArielGlenn: Verified; Looks good to me, approved jenkins-bot: Verified diff --git a/xmldumps-backup/dumpadmin.py b/xmldumps-backup/dumpadmin.py new file mode 100644 index 0000000..a39ef0e --- /dev/null +++ b/xmldumps-backup/dumpadmin.py @@ -0,0 +1,709 @@ +''' +run a set of commands in a given order, +given information about how many free slots +the host has for such processes and how many +slots each process takes. +''' +import os +import sys +import getopt +import glob +import socket +import signal +from dumps.utils import RunInfoFile, Chunk +from dumps.runnerutils import Checksummer, Status, NoticeFile, SymLinks +from dumps.jobs import DumpDir +from worker import DumpItemList +from dumps.WikiDump import Wiki, Config + + +def command_has_wiki(pid, wikiname): + ''' + see if the process with the given pid is operating on the + given wiki by checking process command line args + ''' + + if wikiname is None: + # no check needed + return True + + try: + process_command = open("/proc/%s/cmdline" % pid, "r") + except IOError: + # permission or gone, anyways not us + return False + for line in process_command: + if line: + fields = line.split("\x00") + for field in fields: + if field == wikiname or field == "--wiki=" + wikiname: + process_command.close() + return True + process_command.close() + return False + + +def get_job_output_files(wiki, job, dump_item_list): + ''' + return list of output files produced by job + ''' + job_files = [] + for item in dump_item_list: + if item.name() == job: + job_files = item.list_outfiles_for_cleanup(DumpDir(wiki, wiki.db_name)) + break + return job_files + + +def check_process_running(pid): + ''' + see if process with given pid is running + and if we started it. + + it's possible for the process to die or be killed + in the meantime after this returns, what can you do + ''' + try: + os.kill(int(pid), 0) + except OSError: + return False + + try: + process_environ = open("/proc/%s/environ" % pid, "r") + except IOError: + # permission or gone, anyways not us + return False + for line in process_environ: + if line: + fields = line.split("\x00") + for field in fields: + if field == "DUMPS": + process_environ.close() + return True + process_environ.close() + return False + + +def get_lockfile_content(filename): + ''' + return the host running the process + and the pid of the process that created + the lockfile + ''' + with open(filename, "r") as lockfile: + content = lockfile.read() + lines = content.splitlines() + if len(lines) != 1: + return(None, None) + else: + host, pid = lines[0].split(" ", 1) + return(host, pid) + + +def create_file(filename): + ''' + create an empty file + ''' + open(filename, 'a').close() + + +def remove_file(filename): + ''' + remove a file + ''' + try: + os.unlink(filename) + except: + pass + + +class ActionHandler(object): + ''' + methods for all actions, whether on one wiki or on all + ''' + + def __init__(self, actions, message, undo, configfile, + wikiname, dryrun, verbose): + ''' + constructor. + reads configs for every wiki, this might be wasteful + but really how long can it take? even with 1k wikis + ''' + self.verbose = verbose + if not actions and not undo: + if self.verbose: + sys.stderr.write("No actions specified.\n") + return + + self.actions = actions + self.undo = undo + self.dryrun = dryrun + self.wikiname = wikiname + self.configfile = configfile + self.message = message + self.conf = Config(self.configfile) + + if self.wikiname is None: + self.wikilist = self.conf.db_list + else: + self.wikilist = [self.wikiname] + + self.wikiconfs = {} + for wiki in self.wikilist: + self.wikiconfs[wiki] = self.get_wiki_config(wiki) + + def get_wiki_config(self, wikiname): + ''' + parse and return the configuration for a particular wiki + ''' + wikiconf = Config(self.configfile) + wikiconf.parse_conffile_per_project(wikiname) + return wikiconf + + def do_all(self): + ''' + do all actions specified at instantiation time + ''' + self.conf.parse_conffile_globally() + self.do_global_actions() + self.undo_global_actions() + self.do_per_wiki_actions() + self.undo_per_wiki_actions() + + def do_global_actions(self): + ''' + do all actions that either do not + reference a particular wiki (maintenance, + exit) or may run on one or all wikis + ''' + for item in self.actions: + if item == "kill": + self.do_kill() + elif item == "unlock": + self.do_unlock() + elif item == "remove": + self.do_remove() + elif item == "maintenance": + self.do_maintenance() + elif item == "exit": + self.do_exit() + + def do_per_wiki_actions(self): + ''' + do all actions that must reference + only one wiki + ''' + for item in self.actions: + for wiki in self.wikiconfs: + if item == "notice": + self.do_notice(wiki) + + def undo_global_actions(self): + ''' + undo all specified actions that do not + reference a particular wiki + ''' + for item in self.undo: + if item == "maintenance": + self.undo_maintenance() + elif item == "exit": + self.undo_exit() + + def undo_per_wiki_actions(self): + ''' + undo all specified actions that must + reference a particular wiki + ''' + for wiki in self.wikiconfs: + for item in self.undo: + if item == "notice": + self.undo_notice(wiki) + + def get_dump_pids(self): + ''' + get list of pids either for one wiki or for all + which are running dumps; these must have been started by + either the scheduler, the bash wrapper or the worker.py + script. i.e. if a user runs dumpBackups.php by hand + that is not going to be picked up. + + don't rely on lock files, they may have been removed or not created + look up processes with DUMPS environ var set. values: + 'scheduler' (the dumps scheduler) + 'wrapper' (the bash dumps wrapper that runs across all wikis + pid (the worker that runs on one wiki and any processes it spawned) + we want at all costs to avoid hardcoded list of commands + ''' + pids = [] + uid = os.geteuid() + for process_id in os.listdir('/proc'): + if process_id.isdigit(): + # owned by us + puid = os.stat(os.path.join('/proc', process_id)).st_uid + if puid == uid: + # has DUMPS environ var + try: + process_environ = open("/proc/%s/environ" % process_id, "r") + except IOError: + # permission or gone, anyways not us + continue + for line in process_environ: + if line: + fields = line.split("\x00") + for field in fields: + if field.startswith("DUMPS="): + # if no wiki specified for instance, get procs for all + if self.wikiname is None or command_has_wiki(process_id, self.wikiname): + pids.append(process_id) + break + process_environ.close() + return pids + + def do_kill(self): + ''' + kill all dump related processes for the wiki specified + at instantiation or all wikis; good only for processes + started by the scheduler, the bash wrapper script or + the python worker script + ''' + pids = self.get_dump_pids() + if self.dryrun: + print "would kill processes", pids + return + elif self.verbose: + print "killing these processes:", pids + + for pid in pids: + os.kill(int(pid), signal.SIGTERM) + + def do_unlock(self): + ''' + unlock either wiki specified at instantiation or + all wikis, provided they were locked on current host + ''' + lock_info = self.find_dump_lockinfo() + # fixme does this iter over keys? + for wiki in lock_info: + if check_process_running(lock_info[wiki]['pid']): + continue + if self.dryrun: + print "would remove lock", lock_info[wiki]['name'] + else: + if self.verbose: + print "removing lock for", wiki + os.unlink(lock_info[wiki]['filename']) + + def find_failed_dumps_for_wiki(self, wikiname): + ''' + return list of failed jobs for the latest run + for the specified wiki or empty list if there are none + ''' + + failed_jobs = [] + # fixme how is the above a string, shouldn't it be a function? + wiki = Wiki(self.wikiconfs[wikiname], wikiname) + date = wiki.latest_dump() + if date is None: + return [], None + + wiki.set_date(date) + run_info_file = RunInfoFile(wiki, False) + results = run_info_file.get_old_runinfo_from_file() + if not results: + return [], None + + for entry in results: + if entry.status() == "failed": + failed_jobs.append(entry.name()) + return failed_jobs, date + + def find_failed_dumps(self): + ''' + return dict of failed jobs per wiki during most recent run, + skipping over wikis with no failed jobs + ''' + + failed_dumps = {} + for wiki in self.wikilist: + results, date = self.find_failed_dumps_for_wiki(wiki) + if results and date is not None: + failed_dumps[wiki] = {} + failed_dumps[wiki][date] = results + + if self.verbose: + print "failed dumps info:", failed_dumps + return failed_dumps + + def do_remove(self): + ''' + find all failed dump jobs for unlocked wikis + clean them up after getting lock on each one + first, then remove lock + + if a specific wiki was specified at instantiation, + clean up only that wiki + ''' + failed_dumps = self.find_failed_dumps() + for wikiname in failed_dumps: + for date in failed_dumps[wikiname]: + wiki = Wiki(self.wikiconfs[wikiname], wikiname) + wiki.set_date(date) + + try: + wiki.lock() + except: + sys.stderr.write("Couldn't lock %s, can't do cleanup\n" % wikiname) + continue + self.cleanup_dump(wiki, failed_dumps[wikiname][date]) + wiki.unlock() + + def cleanup_dump(self, wiki, failed_jobs): + ''' + for the specified wiki, and the given list + of failed jobs, find all the output files, toss + them, then rebuild: md5sums file, symlinks + into latest dir, dump run info file + ''' + chunk_info = Chunk(wiki, wiki.db_name) + dump_dir = DumpDir(wiki, wiki.db_name) + run_info_file = RunInfoFile(wiki, True) + dump_item_list = DumpItemList(wiki, False, False, False, None, None, + True, chunk_info, None, run_info_file, dump_dir) + if not failed_jobs: + if self.verbose: + print "no failed jobs for wiki", wiki + + for job in failed_jobs: + files = get_job_output_files(wiki, job, dump_item_list.dump_items) + paths = [dump_dir.filename_public_path(fileinfo) for fileinfo in files] + if self.verbose: + print "for job", job, "these are the output files:", paths + for filename in paths: + if self.dryrun: + print "would unlink", filename + else: + try: + os.unlink(filename) + except: + continue + + if self.dryrun: + print "would update dumpruninfo file, checksums file, ", + print "status file, index.html file and symlinks to latest dir" + return + + # need to update status files, dumpruninfo, checksums file + # and latest links. + checksums = Checksummer(wiki, dump_dir, True, False) + html_notice_file = NoticeFile(wiki, "", True) + status = Status(wiki, dump_dir, dump_item_list.dump_items, checksums, + True, False, html_notice_file, None, self.verbose) + if self.verbose: + print "updating status files for wiki", wiki.db_name + status.update_status_files() + run_info_file = RunInfoFile(wiki, True) + if self.verbose: + print "updating dump run info file for wiki", wiki.db_name + run_info_file.save_dump_runinfo_file(dump_item_list.report_dump_runinfo()) + symlinks = SymLinks(wiki, dump_dir, False, False, True) + if self.verbose: + print "updating symlinks for wiki", wiki.db_name + symlinks.cleanup_symlinks() + + def do_maintenance(self): + ''' + create an empty maintenance.txt file + causes the dump runners after the next job + to run no jobs per wiki + and sleep 5 minutes in between each wiki + + this is a global action that affects all wikis + run on the given host + ''' + if self.dryrun: + print "would create maintenance file" + return + elif self.verbose: + print "creating maintenance file" + create_file("maintenance.txt") + + def do_exit(self): + ''' + create an empty exit.txt file; causes the + dump runners to exit after next job + + this is a global action that affects all wikis + run on the given host + ''' + if self.dryrun: + print "would create exit file" + return + elif self.verbose: + print "creating exit file" + create_file("exit.txt") + + def do_notice(self, wikiname): + ''' + create a notice.txt file for the particular wiki for + the most recent run. the contents will appear on its + web page for that dump run + ''' + wiki = Wiki(self.wikiconfs[wikiname], wikiname) + date = wiki.latest_dump() + if date is None: + print "dump never run, not adding notice file for wiki", wikiname + return + + if self.dryrun: + print "would add notice.txt for wiki", wikiname, "date", date + return + elif self.verbose: + print "creating notice file for wiki", wikiname, "date", date + + wiki.set_date(date) + NoticeFile(wiki, self.message, True) + + def undo_maintenance(self): + ''' + remove any maintenance.txt file that may exist, + resumes normal operations + ''' + if self.dryrun: + print "would remove maintenance file" + return + elif self.verbose: + print "removing maintenance file" + remove_file("maintenance.txt") + + def undo_exit(self): + ''' + remove any exit.txt file that may exist, + resumes normal operations + ''' + if self.dryrun: + print "would remove exit file" + return + elif self.verbose: + print "removing exit file" + remove_file("exit.txt") + + def undo_notice(self, wikiname): + ''' + remove any notice.txt file that may exist + for the most current run for the given wiki + ''' + wiki = Wiki(self.wikiconfs[wikiname], wikiname) + date = wiki.latest_dump() + if date is None: + print "dump never run, no notice file to remove for wiki", wikiname + return + + if self.dryrun: + print "would remove notice.txt for wiki", wikiname, "date", date + return + elif self.verbose: + print "removing notice file for wiki", wikiname, "date", date + + wiki.set_date(date) + NoticeFile(wiki, False, True) + + def find_dump_lockinfo(self): + ''' + get host and pid information for lockfiles for the wiki + specified at instantiation or for all wikis + ''' + my_hostname = socket.getfqdn() + + lockfiles = [] + results = {} + if self.wikiname is not None: + path = os.path.join(self.wikiconfs[self.wikiname].private_dir, self.wikiname, "lock") + if os.path.exists(path): + lockfiles = [path] + + else: + lockfiles = glob.glob(os.path.join(self.conf.private_dir, "*", "lock")) + + for filename in lockfiles: + host, pid = get_lockfile_content(filename) + wiki = self.get_wiki_from_lockfilename(filename) + if host == my_hostname: + results[wiki] = {'pid': pid, 'host': host, 'filename': filename} + return results + + def get_wiki_from_lockfilename(self, filename): + ''' + given the full lockfile name, grab the wiki name out of it + and return it + ''' + if filename.endswith("lock"): + filename = filename[:-4] + if filename.startswith(self.conf.private_dir): + filename = filename[len(self.conf.private_dir):] + filename = filename.strip(os.path.sep) + return filename + + +def usage(message=None): + ''' + display a helpful usage message with + an optional introductory message first + ''' + + if message is not None: + sys.stderr.write(message) + sys.stderr.write("\n") + usage_message = """ +Usage: dumpadmin.py --<action> [--<action>...] + [--configfile] [--wiki] [--dryrun] [--verbose] [--help] + + where <action> is one of the following: + + kill (-k) kill all running workers and their children + unlock (-u) unlock all locked wikis that have lock + files created by a process that is no + longer running on the current host + remove (-r) remove all failed wiki jobs from most + recent dump, reset wiki status. This + removes ALL related files, so for a + job that produces 4 pages-article files + but only one is actually bad, it will + remove them all. + maintenance (-m) touch maintenance.txt in cwd, causing + workers to run no wikis and sleep 5 + minutes in between checks to see if + maintenance is done + exit (-e) touch exit.txt in cwd, causing workers + to exit after next job + notice (-n) message supplied will be put into notice + file for the given wiki for the most recent + dump or for all wikis + this notice file is incorporated into + the web page shown to users, once + the page is regenerated (during runs) + + OR + + undo (-U) comma-separated list of 'maintenance', + 'notice', 'exit' + the options specified will be undone + + wiki (-w) run on the specified wiki: default, runs on + all given by the config file + configfile (-c) path to config file + default: wikidump.conf in cwd + dryrun (-d) don't do it but show what would be done + verbose (-v) print many progress messages + help (-h) show this message +""" + sys.stderr.write(usage_message) + sys.exit(1) + + +def check_options(remainder, configfile): + ''' + whine if these options have problems + ''' + if len(remainder) > 0: + usage("Unknown option(s) specified: <%s>" % remainder[0]) + + if not os.path.exists(configfile): + usage("no such file found: " + configfile) + + +def fixup_undo(undo): + ''' + convert comma sep argument into list + ''' + if undo is not None: + undo = [(item).strip() for item in undo.split(",")] + else: + undo = [] + return undo + + +def check_actions(undo, actions): + ''' + make sure no specified action is also in the undo list + ''' + problems = [] + for item in undo: + if item in actions: + problems.append(item) + if problems: + usage("action and undo of action cannot be specified together " + + ", ".join(problems)) + + +def get_action_opt(option): + ''' + return action correspodning to command line option + ''' + action_options = ['kill', 'unlock', 'remove', 'maintenance', 'exit'] + if option.startswith("--"): + option = option[2:] + if option in action_options: + return option + elif option.startswith("-"): + option = option[1:] + for action in action_options: + if action.startswith(option): + return action + + return None + + +def main(): + 'main entry point, does all the work' + + actions = [] + configfile = "wikidump.conf" + dryrun = False + verbose = False + message = None + undo = None + wiki = None + + try: + (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "c:n:U:w:kurmedvh", + ["configfile=", "notice=", "no=", "undo=", + "wiki=", "kill", "unlock", "remove", + "maintenance", "exit", "dryrun", + "verbose", "help"]) + except getopt.GetoptError as err: + usage("Unknown option specified: " + str(err)) + + for (opt, val) in options: + if opt in ["-c", "--configfile"]: + configfile = val + elif opt in ["-n", "--notice"]: + actions.append("notice") + message = val + elif opt in ["-U", "--undo"]: + undo = val + elif opt in ["-w", "--wiki"]: + wiki = val + elif opt in ["-d", "--dryrun"]: + dryrun = True + elif opt in ["-v", "--verbose"]: + verbose = True + elif opt in ["-h", "--help"]: + usage('Help for this script\n') + else: + result = get_action_opt(opt) + if result is not None: + actions.append(result) + else: + usage("Unknown option specified: <%s>" % opt) + + check_options(remainder, configfile) + undo = fixup_undo(undo) + check_actions(undo, actions) + + handler = ActionHandler(actions, message, undo, configfile, + wiki, dryrun, verbose) + handler.do_all() + + +if __name__ == '__main__': + main() diff --git a/xmldumps-backup/dumps/runnerutils.py b/xmldumps-backup/dumps/runnerutils.py index 71433ea..0d34820 100644 --- a/xmldumps-backup/dumps/runnerutils.py +++ b/xmldumps-backup/dumps/runnerutils.py @@ -226,7 +226,7 @@ # runs going at once (think en pedia, one finishing up the history, another # starting at the beginning to get the new abstracts and stubs). try: - dumps_in_order = self.wiki.latest_dump(all=True) + dumps_in_order = self.wiki.latest_dump(return_all=True) me_index = dumps_in_order.index(self.wiki.date) # don't wrap around to the newest dump in the list! if me_index > 0: diff --git a/xmldumps-backup/worker b/xmldumps-backup/worker index e16f39d..5e270dc 100755 --- a/xmldumps-backup/worker +++ b/xmldumps-backup/worker @@ -203,6 +203,8 @@ fi } +DUMPS="wrapper" +export DUMPS failures=0 WIKIDUMP_BASE=`dirname "$0"` set_defaults diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py index 0c1d48b..9d4bf80 100644 --- a/xmldumps-backup/worker.py +++ b/xmldumps-backup/worker.py @@ -1100,6 +1100,8 @@ def main(): + os.environ['DUMPS'] = str(os.getpid()) + try: date = None config_file = False -- To view, visit https://gerrit.wikimedia.org/r/234971 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I861b40210f4c018e99a9fbace544b6e68284ac7d Gerrit-PatchSet: 5 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: Hydriz <ad...@alphacorp.tk> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits