ArielGlenn has submitted this change and it was merged.

Change subject: dumps: clean up construction of list of possible dump jobs for wiki
......................................................................
dumps: clean up construction of list of possible dump jobs for wiki

Move the checks for exceptions for jobs to add out to a single method,
instead of having all those checks inline in the code. This is not the
list of jobs that will necessarily be run, but the list of all jobs
that would be run for a full dump of the wiki.

Change-Id: I1a9a61b3ea654b0e4ff80cd00bd843f9e4b554cf
---
M xmldumps-backup/dumps/runner.py
1 file changed, 99 insertions(+), 102 deletions(-)

Approvals:
  ArielGlenn: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index 7e52e0b..c4ecdee 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -111,21 +111,15 @@
                          "Data for blocks of IP addresses, ranges, and users."),
             PrivateTable("archive", "archivetable",
                          "Deleted page and revision data."),
-            # PrivateTable("updates", "updatestable",
-            #              "Update dataset for OAI updater system."),
             PrivateTable("logging", "loggingtable",
                          "Data for various events (deletions, uploads, etc)."),
             PrivateTable("oldimage", "oldimagetable",
                          "Metadata on prior versions of uploaded images."),
-            # PrivateTable("filearchive", "filearchivetable",
-            #              "Deleted image data"),
             PublicTable("site_stats", "sitestatstable",
                         "A few statistics such as the page count."),
             PublicTable("image", "imagetable",
                         "Metadata on current versions of uploaded media/files."),
-            # PublicTable("oldimage", "oldimagetable",
-            #             "Metadata on prior versions of uploaded media/files."),
             PublicTable("pagelinks", "pagelinkstable",
                         "Wiki page-to-page link records."),
             PublicTable("categorylinks", "categorylinkstable",
@@ -138,9 +132,6 @@
                         "Wiki external URL link records."),
             PublicTable("langlinks", "langlinkstable",
                         "Wiki interlanguage link records."),
-            # PublicTable("interwiki", "interwikitable",
-            #             "Set of defined interwiki prefixes " +
-            #             "and links for this wiki."),
             PublicTable("user_groups", "usergroupstable",
                         "User group assignments."),
             PublicTable("category", "categorytable", "Category information."),
@@ -152,10 +143,6 @@
                         "Name/value pairs for pages."),
             PublicTable("protected_titles", "protectedtitlestable",
                         "Nonexistent pages that have been protected."),
-            # PublicTable("revision", revisiontable",
-            #             "Base per-revision data (does not include text)."), // safe?
-            # PrivateTable("text", "texttable",
-            #             "Text blob storage. May be compressed, etc."), // ?
PublicTable("redirect", "redirecttable", "Redirect list"), PublicTable("iwlinks", "iwlinkstable", "Interwiki link tracking records"), @@ -171,18 +158,17 @@ self._get_partnum_todo("abstractsdump"), self.wiki.db_name, self.filepart.get_pages_per_filepart_abstract())] - if self.filepart.parts_enabled(): - self.dump_items.append(RecombineAbstractDump( - "abstractsdumprecombine", "Recombine extracted page abstracts for Yahoo", - self.find_item_by_name('abstractsdump'))) + self.append_job_if_needed(RecombineAbstractDump( + "abstractsdumprecombine", "Recombine extracted page abstracts for Yahoo", + self.find_item_by_name('abstractsdump'))) self.dump_items.append(XmlStub("xmlstubsdump", "First-pass for page XML data dumps", self._get_partnum_todo("xmlstubsdump"), self.filepart.get_pages_per_filepart_history())) - if self.filepart.parts_enabled(): - self.dump_items.append(RecombineXmlStub( - "xmlstubsdumprecombine", "Recombine first-pass for page XML data dumps", - self.find_item_by_name('xmlstubsdump'))) + + self.append_job_if_needed(RecombineXmlStub( + "xmlstubsdumprecombine", "Recombine first-pass for page XML data dumps", + self.find_item_by_name('xmlstubsdump'))) # NOTE that the filepart thing passed here is irrelevant, # these get generated from the stubs which are all done in one pass @@ -197,15 +183,15 @@ self.wiki, self._get_partnum_todo("articlesdump"), self.filepart.get_pages_per_filepart_history(), checkpoints, self.checkpoint_file, self.page_id_range)) - if self.filepart.parts_enabled(): - self.dump_items.append( - RecombineXmlDump( - "articlesdumprecombine", - "<big><b>Recombine articles, templates, media/file descriptions, " + - "and primary meta-pages.</b></big>", - "This contains current versions of article content, and is " + - "the archive most mirror sites will probably want.", - self.find_item_by_name('articlesdump'))) + + self.append_job_if_needed( + RecombineXmlDump( + "articlesdumprecombine", + "<big><b>Recombine articles, templates, media/file descriptions, " + + "and primary meta-pages.</b></big>", + "This contains current versions of article content, and is " + + "the archive most mirror sites will probably want.", + self.find_item_by_name('articlesdump'))) self.dump_items.append( XmlDump("meta-current", @@ -218,65 +204,60 @@ self.filepart.get_pages_per_filepart_history(), checkpoints, self.checkpoint_file, self.page_id_range)) - if self.filepart.parts_enabled(): - self.dump_items.append( - RecombineXmlDump( - "metacurrentdumprecombine", - "Recombine all pages, current versions only.", - "Discussion and user pages are included in this complete archive. " + - "Most mirrors won't want this extra material.", - self.find_item_by_name('metacurrentdump'))) + self.append_job_if_needed( + RecombineXmlDump( + "metacurrentdumprecombine", + "Recombine all pages, current versions only.", + "Discussion and user pages are included in this complete archive. 
" + + "Most mirrors won't want this extra material.", + self.find_item_by_name('metacurrentdump'))) self.dump_items.append( XmlLogging("Log events to all pages and users.")) - if self._has_flagged_revs: - self.dump_items.append( - PublicTable("flaggedpages", "flaggedpagestable", - "This contains a row for each flagged article, " + - "containing the stable revision ID, if the lastest edit " + - "was flagged, and how long edits have been pending.")) - self.dump_items.append( - PublicTable("flaggedrevs", "flaggedrevstable", - "This contains a row for each flagged revision, " + - "containing who flagged it, when it was flagged, " + - "reviewer comments, the flag values, and the " + - "quality tier those flags fall under.")) + self.append_job_if_needed( + PublicTable("flaggedpages", "flaggedpagestable", + "This contains a row for each flagged article, " + + "containing the stable revision ID, if the lastest edit " + + "was flagged, and how long edits have been pending.")) + self.append_job_if_needed( + PublicTable("flaggedrevs", "flaggedrevstable", + "This contains a row for each flagged revision, " + + "containing who flagged it, when it was flagged, " + + "reviewer comments, the flag values, and the " + + "quality tier those flags fall under.")) - if self._has_wikidata: - self.dump_items.append( - PublicTable("wb_items_per_site", "wbitemspersitetable", - "For each Wikidata item, this contains rows with the " + - "corresponding page name on a given wiki project.")) - self.dump_items.append( - PublicTable("wb_terms", "wbtermstable", - "For each Wikidata item, this contains rows with a label, " + - "an alias and a description of the item in a given language.")) - self.dump_items.append( - PublicTable("wb_entity_per_page", "wbentityperpagetable", - "Contains a mapping of page ids and entity ids, with " + - "an additional entity type column.")) - self.dump_items.append( - PublicTable("wb_property_info", "wbpropertyinfotable", - "Contains a mapping of Wikidata property ids and data types.")) - self.dump_items.append( - PublicTable("wb_changes_subscription", "wbchangessubscriptiontable", - "Tracks which Wikibase Client wikis are using which items.")) - self.dump_items.append( - PublicTable("sites", "sitestable", - "This contains the SiteMatrix information from " + - "meta.wikimedia.org provided as a table.")) + self.append_job_if_needed( + PublicTable("wb_items_per_site", "wbitemspersitetable", + "For each Wikidata item, this contains rows with the " + + "corresponding page name on a given wiki project.")) + self.append_job_if_needed( + PublicTable("wb_terms", "wbtermstable", + "For each Wikidata item, this contains rows with a label, " + + "an alias and a description of the item in a given language.")) + self.append_job_if_needed( + PublicTable("wb_entity_per_page", "wbentityperpagetable", + "Contains a mapping of page ids and entity ids, with " + + "an additional entity type column.")) + self.append_job_if_needed( + PublicTable("wb_property_info", "wbpropertyinfotable", + "Contains a mapping of Wikidata property ids and data types.")) + self.append_job_if_needed( + PublicTable("wb_changes_subscription", "wbchangessubscriptiontable", + "Tracks which Wikibase Client wikis are using which items.")) + self.append_job_if_needed( + PublicTable("sites", "sitestable", + "This contains the SiteMatrix information from " + + "meta.wikimedia.org provided as a table.")) - if self._has_global_usage: - self.dump_items.append( - PublicTable("globalimagelinks", "globalimagelinkstable", - "Global wiki media/files 
usage records.")) + self.append_job_if_needed( + PublicTable("globalimagelinks", "globalimagelinkstable", + "Global wiki media/files usage records.")) - if self._is_wikidata_client: - self.dump_items.append( - PublicTable("wbc_entity_usage", "wbcentityusagetable", - "Tracks which pages use which Wikidata items or properties " + - "and what aspect (e.g. item label) is used.")) + self.append_job_if_needed( + PublicTable("wbc_entity_usage", "wbcentityusagetable", + "Tracks which pages use which Wikidata items or properties " + + "and what aspect (e.g. item label) is used.")) self.dump_items.append( BigXmlDump( @@ -291,16 +272,15 @@ self.wiki, self._get_partnum_todo("metahistorybz2dump"), self.filepart.get_pages_per_filepart_history(), checkpoints, self.checkpoint_file, self.page_id_range)) - if self.filepart.parts_enabled() and self.filepart.recombine_history(): - self.dump_items.append( - RecombineXmlDump( - "metahistorybz2dumprecombine", - "Recombine all pages with complete edit history (.bz2)", - "These dumps can be *very* large, uncompressing up to " + - "100 times the archive download size. " + - "Suitable for archival and statistical use, " + - "most mirror sites won't want or need this.", - self.find_item_by_name('metahistorybz2dump'))) + self.append_job_if_needed( + RecombineXmlDump( + "metahistorybz2dumprecombine", + "Recombine all pages with complete edit history (.bz2)", + "These dumps can be *very* large, uncompressing up to " + + "100 times the archive download size. " + + "Suitable for archival and statistical use, " + + "most mirror sites won't want or need this.", + self.find_item_by_name('metahistorybz2dump'))) self.dump_items.append( XmlRecompressDump( "meta-history", @@ -314,16 +294,15 @@ self.wiki, self._get_partnum_todo("metahistory7zdump"), self.filepart.get_pages_per_filepart_history(), checkpoints, self.checkpoint_file)) - if self.filepart.parts_enabled() and self.filepart.recombine_history(): - self.dump_items.append( - RecombineXmlRecompressDump( - "metahistory7zdumprecombine", - "Recombine all pages with complete edit history (.7z)", - "These dumps can be *very* large, uncompressing " + - "up to 100 times the archive download size. " + - "Suitable for archival and statistical use, " + - "most mirror sites won't want or need this.", - self.find_item_by_name('metahistory7zdump'), self.wiki)) + self.append_job_if_needed( + RecombineXmlRecompressDump( + "metahistory7zdumprecombine", + "Recombine all pages with complete edit history (.7z)", + "These dumps can be *very* large, uncompressing " + + "up to 100 times the archive download size. 
" + + "Suitable for archival and statistical use, " + + "most mirror sites won't want or need this.", + self.find_item_by_name('metahistory7zdump'), self.wiki)) # doing this only for recombined/full articles dump if self.wiki.config.multistream_enabled: if self.filepart.parts_enabled(): @@ -350,6 +329,24 @@ else: self.old_runinfo_retrieved = False + def append_job_if_needed(self, job): + if job.name().endswith("recombine"): + if self.filepart.parts_enabled(): + if 'metahistory' not in job.name() or self.filepart.recombine_history(): + self.dump_items.append(job) + elif job.name().startswith("wb_") or job.name() == "sitestable": + if self._has_wikidata: + self.dump_items.append(job) + elif job.name().startswith("flagged"): + if self._has_flagged_revs: + self.dump_items.append(job) + elif job.name().startswith("global"): + if self._has_global_usage: + self.dump_items.append(job) + elif job.name().startswith("wbc_"): + if self._is_wikidata_client: + self.dump_items.append(job) + def append_job(self, jobname, job): if jobname not in self.skip_jobs: self.dump_items.append(job) -- To view, visit https://gerrit.wikimedia.org/r/252132 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1a9a61b3ea654b0e4ff80cd00bd843f9e4b554cf Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits