ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/362462 )

Change subject: batch abstract jobs and do abstracts and stubs in smaller queries
......................................................................


batch abstract jobs and do abstracts and stubs in smaller queries

request fewer pages at once, and allow abstract jobs to be batched
in smaller groups than the standard number used for the other dump steps on large wikis

also fix a bug with dryrun reporting

Change-Id: I845f8b135dcb9de8e73123697311b7e1c623b6ce
---
M xmldumps-backup/dumps/runner.py
M xmldumps-backup/dumps/xmljobs.py
M xmldumps-backup/xmlabstracts.py
M xmldumps-backup/xmlstubs.py
4 files changed, 18 insertions(+), 10 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index 4a46e02..012ed65 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -204,6 +204,7 @@
                                              "Extracted page abstracts for Yahoo",
                                              self._get_partnum_todo("abstractsdump"),
                                              self.wiki.db_name,
+                                             get_int_setting(self.jobsperbatch, "abstractsdump"),
                                              self.filepart.get_pages_per_filepart_abstract())])
 
         self.append_job_if_needed(RecombineAbstractDump(
@@ -636,7 +637,7 @@
         """
         if self.dryrun:
             self.pretty_print_commands(command_series_list)
-            return 0
+            return 0, None
 
         else:
             commands = CommandsInParallel(command_series_list, callback_stderr=callback_stderr,
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index 588fcb4..2b28c46 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -209,8 +209,9 @@
 class AbstractDump(Dump):
     """XML dump for Yahoo!'s Active Abstracts thingy"""
 
-    def __init__(self, name, desc, partnum_todo, db_name, parts=False):
+    def __init__(self, name, desc, partnum_todo, db_name, jobsperbatch=None, parts=False):
         self._partnum_todo = partnum_todo
+        self.jobsperbatch = jobsperbatch
         self._parts = parts
         if self._parts:
             self._parts_enabled = True
@@ -276,14 +277,20 @@
          # choose the empty variant to pass to buildcommand, it will fill in the rest if needed
         output_dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
         dumpname0 = self.list_dumpnames()[0]
-        for dfname in output_dfnames:
-            if dfname.dumpname == dumpname0:
+        wanted_dfnames = [dfname for dfname in output_dfnames if dfname.dumpname == dumpname0]
+        if self.jobsperbatch is not None:
+            maxjobs = self.jobsperbatch
+        else:
+            maxjobs = len(wanted_dfnames)
+        for batch in batcher(wanted_dfnames, maxjobs):
+            commands = []
+            for dfname in batch:
                 series = self.build_command(runner, dfname)
                 commands.append(series)
-        error, broken = runner.run_command(commands, callback_stderr=self.progress_callback,
-                                           callback_stderr_arg=runner)
-        if error:
-            raise BackupError("error producing abstract dump")
+            error, broken = runner.run_command(commands, callback_stderr=self.progress_callback,
+                                               callback_stderr_arg=runner)
+            if error:
+                raise BackupError("error producing abstract dump")
 
     # If the database name looks like it's marked as Chinese language,
     # return a list including Simplified and Traditional versions, so
diff --git a/xmldumps-backup/xmlabstracts.py b/xmldumps-backup/xmlabstracts.py
index 18662d6..7f378d7 100644
--- a/xmldumps-backup/xmlabstracts.py
+++ b/xmldumps-backup/xmlabstracts.py
@@ -59,7 +59,7 @@
 
     do_xml_stream(wikidb, outfiles, command, wikiconf,
                   start, end, dryrun, 'page_id', 'page',
-                  20000, 30000, '</doc>\n')
+                  5000, 10000, '</doc>\n')
 
 
 # fixme must take a list of output files and a list of
diff --git a/xmldumps-backup/xmlstubs.py b/xmldumps-backup/xmlstubs.py
index 06a6485..47be3a7 100644
--- a/xmldumps-backup/xmlstubs.py
+++ b/xmldumps-backup/xmlstubs.py
@@ -135,7 +135,7 @@
 
     do_xml_stream(wikidb, outfiles, command, wikiconf,
                   start, end, dryrun, 'page_id', 'page',
-                  5000, 100000, '</page>\n', callback)
+                  5000, 20000, '</page>\n', callback)
 
 
 def usage(message=None):

-- 
To view, visit https://gerrit.wikimedia.org/r/362462
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I845f8b135dcb9de8e73123697311b7e1c623b6ce
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to