ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/362462 )
Change subject: batch abstract jobs and do abstracts and stubs in smaller queries ...................................................................... batch abstract jobs and do abstracts and stubs in smaller queries Request fewer pages at once, and on large wikis allow abstract jobs to be batched, with fewer jobs per batch than the standard number for the other dump steps. Also fix a bug with dryrun reporting. Change-Id: I845f8b135dcb9de8e73123697311b7e1c623b6ce --- M xmldumps-backup/dumps/runner.py M xmldumps-backup/dumps/xmljobs.py M xmldumps-backup/xmlabstracts.py M xmldumps-backup/xmlstubs.py 4 files changed, 18 insertions(+), 10 deletions(-) Approvals: ArielGlenn: Looks good to me, approved jenkins-bot: Verified diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py index 4a46e02..012ed65 100644 --- a/xmldumps-backup/dumps/runner.py +++ b/xmldumps-backup/dumps/runner.py @@ -204,6 +204,7 @@ "Extracted page abstracts for Yahoo", self._get_partnum_todo("abstractsdump"), self.wiki.db_name, + get_int_setting(self.jobsperbatch, "abstractsdump"), self.filepart.get_pages_per_filepart_abstract())]) self.append_job_if_needed(RecombineAbstractDump( @@ -636,7 +637,7 @@ """ if self.dryrun: self.pretty_print_commands(command_series_list) - return 0 + return 0, None else: commands = CommandsInParallel(command_series_list, callback_stderr=callback_stderr, diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py index 588fcb4..2b28c46 100644 --- a/xmldumps-backup/dumps/xmljobs.py +++ b/xmldumps-backup/dumps/xmljobs.py @@ -209,8 +209,9 @@ class AbstractDump(Dump): """XML dump for Yahoo!'s Active Abstracts thingy""" - def __init__(self, name, desc, partnum_todo, db_name, parts=False): + def __init__(self, name, desc, partnum_todo, db_name, jobsperbatch=None, parts=False): self._partnum_todo = partnum_todo + self.jobsperbatch = jobsperbatch self._parts = parts if self._parts: self._parts_enabled = True @@ -276,14 +277,20 @@ # choose the empty variant 
to pass to buildcommand, it will fill in the rest if needed output_dfnames = self.list_outfiles_for_build_command(runner.dump_dir) dumpname0 = self.list_dumpnames()[0] - for dfname in output_dfnames: - if dfname.dumpname == dumpname0: + wanted_dfnames = [dfname for dfname in output_dfnames if dfname.dumpname == dumpname0] + if self.jobsperbatch is not None: + maxjobs = self.jobsperbatch + else: + maxjobs = len(wanted_dfnames) + for batch in batcher(wanted_dfnames, maxjobs): + commands = [] + for dfname in batch: series = self.build_command(runner, dfname) commands.append(series) - error, broken = runner.run_command(commands, callback_stderr=self.progress_callback, - callback_stderr_arg=runner) - if error: - raise BackupError("error producing abstract dump") + error, broken = runner.run_command(commands, callback_stderr=self.progress_callback, + callback_stderr_arg=runner) + if error: + raise BackupError("error producing abstract dump") # If the database name looks like it's marked as Chinese language, # return a list including Simplified and Traditional versions, so diff --git a/xmldumps-backup/xmlabstracts.py b/xmldumps-backup/xmlabstracts.py index 18662d6..7f378d7 100644 --- a/xmldumps-backup/xmlabstracts.py +++ b/xmldumps-backup/xmlabstracts.py @@ -59,7 +59,7 @@ do_xml_stream(wikidb, outfiles, command, wikiconf, start, end, dryrun, 'page_id', 'page', - 20000, 30000, '</doc>\n') + 5000, 10000, '</doc>\n') # fixme must take a list of ouput files and a list of diff --git a/xmldumps-backup/xmlstubs.py b/xmldumps-backup/xmlstubs.py index 06a6485..47be3a7 100644 --- a/xmldumps-backup/xmlstubs.py +++ b/xmldumps-backup/xmlstubs.py @@ -135,7 +135,7 @@ do_xml_stream(wikidb, outfiles, command, wikiconf, start, end, dryrun, 'page_id', 'page', - 5000, 100000, '</page>\n', callback) + 5000, 20000, '</page>\n', callback) def usage(message=None): -- To view, visit https://gerrit.wikimedia.org/r/362462 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings 
Gerrit-MessageType: merged Gerrit-Change-Id: I845f8b135dcb9de8e73123697311b7e1c623b6ce Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: master Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits