ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399753 )
Change subject: permit use of 7zip compressed files for prefetch
......................................................................
permit use of 7zip compressed files for prefetch
Bug: T179267
Change-Id: I14d4636c78d81a9bfbf04f7f4c218875fcb870dc
---
M xmldumps-backup/defaults.conf
M xmldumps-backup/dumps/WikiDump.py
M xmldumps-backup/dumps/xmlcontentjobs.py
3 files changed, 73 insertions(+), 42 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
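For context before the diff: the new option defaults to off, and enabling it would presumably mean setting the following in an overrides config that get_opt_in_overrides_or_default consults (a hypothetical override excerpt, mirroring the defaults.conf entry added below; not part of this change):

    [misc]
    sevenzipprefetch=1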
diff --git a/xmldumps-backup/defaults.conf b/xmldumps-backup/defaults.conf
index 222cb2d..109fad6 100644
--- a/xmldumps-backup/defaults.conf
+++ b/xmldumps-backup/defaults.conf
@@ -80,4 +80,5 @@
maxrevs=50000
[misc]
-fixeddumporder=0
\ No newline at end of file
+fixeddumporder=0
+sevenzipprefetch=0
diff --git a/xmldumps-backup/dumps/WikiDump.py b/xmldumps-backup/dumps/WikiDump.py
index 005f858..77b86f0 100644
--- a/xmldumps-backup/dumps/WikiDump.py
+++ b/xmldumps-backup/dumps/WikiDump.py
@@ -316,6 +316,11 @@
self.conf.add_section('wiki')
self.wiki_dir = self.get_opt_for_proj_or_default("wiki", "dir", 0)
+ if not self.conf.has_section('misc'):
+ self.conf.add_section('misc')
+ self.sevenzip_prefetch = self.get_opt_in_overrides_or_default("misc", "sevenzipprefetch", 0)
+ self.sevenzip_prefetch = int(self.sevenzip_prefetch, 0)
+
def db_latest_status(self):
'''
return list of tuples for each wiki:
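A minimal standalone sketch of the parsing added above, using plain ConfigParser rather than the project's get_opt_in_overrides_or_default wrapper (assumed Python 3; the int(value, 0) form auto-detects the numeric base):

    from configparser import ConfigParser

    conf = ConfigParser()
    conf.read_string("[misc]\nsevenzipprefetch=1\n")
    # base 0 lets "0", "1", and prefixed forms like "0x1" all parse
    sevenzip_prefetch = int(conf.get("misc", "sevenzipprefetch", fallback="0"), 0)
    print(sevenzip_prefetch)  # 1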
diff --git a/xmldumps-backup/dumps/xmlcontentjobs.py b/xmldumps-backup/dumps/xmlcontentjobs.py
index 2e21232..7b7956a 100644
--- a/xmldumps-backup/dumps/xmlcontentjobs.py
+++ b/xmldumps-backup/dumps/xmlcontentjobs.py
@@ -198,6 +198,55 @@
pagerange['end'] = None
return pagerange
+ def _find_prefetch_files_from_run(self, runner, date, jobinfo,
+ pagerange, file_ext):
+ """
+ for a given wiki and date, see if there are dump content
+ files lying about that can be used for prefetch to the
+ current job, with the given file extension (might be bz2s
+ or 7zs or whatever) for the given range of pages
+ """
+ dfnames = get_checkpt_files(
+ runner.dump_dir, [jobinfo['dumpname']], self.jobinfo['ftype'],
+ file_ext, date, parts=None)
+ possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
+ dfnames, pagerange, date, runner)
+ if len(possible_prefetch_dfnames):
+ return possible_prefetch_dfnames
+
+ # ok, let's check for file parts instead, from any run
+ # (may not conform to our numbering for this job)
+ dfnames = get_reg_files(
+ runner.dump_dir, [jobinfo['dumpname']], jobinfo['ftype'],
+ file_ext, date, parts=True)
+ possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
+ dfnames, pagerange, date, runner)
+ if len(possible_prefetch_dfnames):
+ return possible_prefetch_dfnames
+
+ # last shot, get output file that contains all the pages, if there is one
+ dfnames = get_reg_files(
+ runner.dump_dir, [jobinfo['dumpname']],
+ jobinfo['ftype'], file_ext, date, parts=False)
+ # there is only one, don't bother to check for relevance :-P
+ possible_prefetch_dfnames = dfnames
+ dfnames = []
+ for prefetch_dfname in possible_prefetch_dfnames:
+ if runner.wiki.is_private():
+ possible_path = runner.dump_dir.filename_private_path(prefetch_dfname, date)
+ else:
+ possible_path = runner.dump_dir.filename_public_path(prefetch_dfname, date)
+ size = os.path.getsize(possible_path)
+ if size < 70000:
+ runner.debug("small %d-byte prefetch dump at %s, skipping" % (
+ size, possible_path))
+ continue
+ else:
+ dfnames.append(prefetch_dfname)
+ if len(dfnames):
+ return dfnames
+ return None
+
def _find_previous_dump(self, runner, partnum=None):
"""
this finds the content file or files from the first previous
successful dump
@@ -226,46 +275,15 @@
runner.debug("skipping incomplete or failed dump for prefetch
date %s" % date)
continue
- # first check if there are checkpoint files from this run we can use
- dfnames = get_checkpt_files(
- runner.dump_dir, [self.jobinfo['dumpname']], self.jobinfo['ftype'],
- self.jobinfo['fext'], date, parts=None)
- possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
- dfnames, pagerange, date, runner)
- if len(possible_prefetch_dfnames):
- return possible_prefetch_dfnames
+ # might look first for 7z files, then for bz2,
+ # in any case go through the entire dance for each extension
+ # before giving up and moving to next one
+ for file_ext in self.jobinfo['fexts']:
- # ok, let's check for file parts instead, from any run
- # (may not conform to our numbering for this job)
- dfnames = get_reg_files(
- runner.dump_dir, [self.jobinfo['dumpname']], self.jobinfo['ftype'],
- self.jobinfo['fext'], date, parts=True)
- possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
- dfnames, pagerange, date, runner)
- if len(possible_prefetch_dfnames):
- return possible_prefetch_dfnames
-
- # last shot, get output file that contains all the pages, if there is one
- dfnames = get_reg_files(
- runner.dump_dir, [self.jobinfo['dumpname']],
- self.jobinfo['ftype'], self.jobinfo['fext'], date, parts=False)
- # there is only one, don't bother to check for relevance :-P
- possible_prefetch_dfnames = dfnames
- dfnames = []
- for prefetch_dfname in possible_prefetch_dfnames:
- if runner.wiki.is_private():
- possible_path = runner.dump_dir.filename_private_path(prefetch_dfname, date)
- else:
- possible_path = runner.dump_dir.filename_public_path(prefetch_dfname, date)
- size = os.path.getsize(possible_path)
- if size < 70000:
- runner.debug("small %d-byte prefetch dump at %s, skipping"
% (
- size, possible_path))
- continue
- else:
- dfnames.append(prefetch_dfname)
- if len(dfnames):
- return dfnames
+ dfnames_found = self._find_prefetch_files_from_run(
+ runner, date, self.jobinfo, pagerange, file_ext)
+ if dfnames_found:
+ return dfnames_found
runner.debug("Could not locate a prefetchable dump.")
return None
@@ -304,7 +322,10 @@
else:
partnum_str = ""
if len(sources) > 0:
- source = "bzip2:%s" % (";".join(sources))
+ if sources[0].endswith('7z'):
+ source = "7zip:%s" % (";".join(sources))
+ else:
+ source = "bzip2:%s" % (";".join(sources))
runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..." %
(self.jobinfo['subset'], partnum_str, source))
prefetch = "--prefetch=%s" % (source)
@@ -676,11 +697,15 @@
dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)
if self.jobinfo['prefetch']:
+ if runner.wiki.config.sevenzip_prefetch:
+ file_exts = ['7z', self.file_ext]
+ else:
+ file_exts = [self.file_ext]
prefetcher = PrefetchFinder(
self.wiki,
{'name': self.name(), 'desc': self.jobinfo['desc'],
'dumpname': self.get_dumpname(),
- 'ftype': self.file_type, 'fext': self.file_ext,
+ 'ftype': self.file_type, 'fexts': file_exts,
'subset': self.jobinfo['subset']},
{'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
self.verbose)
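Putting the pieces together: with sevenzipprefetch on, each candidate dump date is searched once per extension, 7z first, and the full three-stage hunt (checkpoint files, part files, single whole file) runs for one extension before the next is tried. A compact sketch of that ordering (hypothetical helper, not the project's API):

    def prefetch_extensions(sevenzip_enabled, default_ext='bz2'):
        # 7z gets priority only when the new option is set
        return ['7z', default_ext] if sevenzip_enabled else [default_ext]

    for file_ext in prefetch_extensions(True):
        ...  # run the three-stage search in _find_prefetch_files_from_run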
--
To view, visit https://gerrit.wikimedia.org/r/399753
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I14d4636c78d81a9bfbf04f7f4c218875fcb870dc
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>