ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/386162 )

Change subject: one-off scripts for fixing up multistream dump mess
......................................................................


one-off scripts for fixing up multistream dump mess

Just in case we ever need them again, hopefully not, here they are.
Also fix some new pylint whines that have appeared since the last time
I was in here and that prevent jenkins from accepting this change.

Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088
---
A fixups/fixup_hashfiles.py
A fixups/fixup_html.py
A fixups/fixup_recompress_moves.py
A fixups/fixup_report_json.py
M xmldumps-backup/see_master_branch/listmediaperproject.py
M xmldumps-backup/see_master_branch/wikiqueries.py
M xmldumps-backup/tools/fixaborts.py
M xmldumps-backup/tools/mysql2txt.py
8 files changed, 533 insertions(+), 7 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/fixups/fixup_hashfiles.py b/fixups/fixup_hashfiles.py
new file mode 100644
index 0000000..7ad3ec4
--- /dev/null
+++ b/fixups/fixup_hashfiles.py
@@ -0,0 +1,170 @@
+import os
+import sys
+import hashlib
+import json
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file, return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def checksum(filename, htype):
+    "return hash of specified file in string format, using specified hash type"
+    if htype == 'md5':
+        summer = hashlib.md5()
+    else:
+        summer = hashlib.sha1()
+    infhandle = file(filename, "rb")
+    bufsize = 4192 * 32
+    fbuffer = infhandle.read(bufsize)
+    while fbuffer:
+        summer.update(fbuffer)
+        fbuffer = infhandle.read(bufsize)
+    infhandle.close()
+    return summer.hexdigest()
+
+
+def update_hashes_text(hashed_paths, output_file, hash_strings, dryrun):
+    """
+    we expect the file to contain all the existing hashes,
+    we will append to it
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        content = fhandle.read()
+    new_file = output_file + ".new"
+
+    if not dryrun:
+        output_handle = file(new_file, "wt")
+        output_handle.write(content)
+
+    for idx in range(0, len(hashed_paths)):
+        if hashed_paths[idx] in content:
+            # info already present in hash file. skip.
+            continue
+
+        if dryrun:
+            print "would append: '{hsum}  {path}' to".format(
+                hsum=hash_strings[idx], path=hashed_paths[idx]), new_file
+        else:
+            output_handle.write("{hsum}  {path}\n".format(hsum=hash_strings[idx],
+                                                          path=hashed_paths[idx]))
+    if not dryrun:
+        output_handle.close()
+
+
+def update_hashes_json(hashed_paths, output_file, hash_strings, htype, dryrun):
+    """
+    we expect the file to contain all the existing hashes,
+    we read it, load the json, add our entry to the dict, convert it
+    back to json and write it back out as new file
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        contents = fhandle.read()
+        output = json.loads(contents)
+
+    new_file = output_file + ".new"
+    if not dryrun:
+        output_handle = file(new_file, "wt")
+
+    for idx in range(0, len(hashed_paths)):
+        output[htype]["files"][hashed_paths[idx]] = hash_strings[idx]
+
+    if dryrun:
+        print "would write: '{outp}' to".format(outp=json.dumps(output)), 
new_file
+    else:
+        output_handle.write(json.dumps(output))
+        output_handle.close()
+
+
+def update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun):
+    filenames = [os.path.basename(path) for path in file_paths]
+    if ftype == 'txt':
+        update_hashes_text(filenames, hashes_path, hash_strings, dryrun)
+    else:
+        update_hashes_json(filenames, hashes_path, hash_strings, htype, dryrun)
+
+
+def get_hashfile_path(dumpstree, wiki, date, hashtype, filetype):
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    filename = '-'.join([wiki, date, '{htype}sums.{ftype}'.format(htype=hashtype, ftype=filetype)])
+    return os.path.join(dumpsdir, filename)
+
+
+def cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun):
+    """
+    For the specified wiki and date, given the base part of the filename,
+    get the md5 and sha1 sums of the corresponding wiki dump file for
+    that date, append these to the plaintext files of hashes and write
+    out new files.
+
+    Also write new json files of hashes to include this information;
+    these values will overwrite old values if present.
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist,
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        print "skipping this wiki", dumpsdir
+        return
+
+    filenames = ['-'.join([wiki, date, base]) for base in filename_bases]
+    file_paths = [os.path.join(dumpsdir, filename) for filename in filenames]
+    file_paths = [path for path in file_paths if os.path.exists(path)]
+    for htype in ['md5', 'sha1']:
+        for ftype in ['txt', 'json']:
+            hashes_path = get_hashfile_path(dumpstree, wiki, date, htype, ftype)
+            hash_strings = [checksum(filename, htype) for filename in file_paths]
+            update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Adds md5sum and sha1sum of multistream content and index files
+to the plaintext files and the json files with hash lists.
+
+The new files are created with the extension '.new' at the end.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+def do_main(alldbs, dumpstree, date, filename_bases, dryrun):
+    "main entry point"
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    basenames = ['pages-articles-multistream-index.txt.bz2',
+                 'pages-articles-multistream.xml.bz2']
+    do_main(dblist, publicdir, sys.argv[1], basenames,
+            dryrun=True if len(sys.argv) == 3 else False)
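
For reference, the core of checksum() above is the standard chunked hashlib
pattern; a minimal stand-alone sketch (the function name and defaults here are
illustrative, not part of the change) looks like this:

    import hashlib

    def stream_checksum(path, htype='md5', bufsize=4192 * 32):
        # hash the file in fixed-size chunks so large dump files never
        # have to be read into memory all at once
        summer = hashlib.md5() if htype == 'md5' else hashlib.sha1()
        with open(path, 'rb') as infile:
            chunk = infile.read(bufsize)
            while chunk:
                summer.update(chunk)
                chunk = infile.read(bufsize)
        return summer.hexdigest()
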
diff --git a/fixups/fixup_html.py b/fixups/fixup_html.py
new file mode 100644
index 0000000..5281592
--- /dev/null
+++ b/fixups/fixup_html.py
@@ -0,0 +1,132 @@
+import os
+import sys
+
+
+def read_wikis(filepath):
+    "read list of wikis from file, one per line, and return list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def pretty_size(size, quanta):
+    "return size of file scaled down as much as possible."
+    if size < 1024 or len(quanta) == 1:
+        return quanta[0] % size
+    else:
+        return pretty_size(size / 1024.0, quanta[1:])
+
+
+def get_printable_size(filepath):
+    "return size of file with nice human readable format"
+    quanta = ("%d bytes", "%d KB", "%0.1f MB", "%0.1f GB", "%0.1f TB")
+    size = os.path.getsize(filepath)
+    return pretty_size(size, quanta)
+
+
+def get_new_html(multistream_name, multistr_index_name,
+                 multistream_path, multistr_index_path,
+                 html_path):
+    """
+    read old html content, fix up the lines that are missing info
+    for the multistream content and index files, return the new
+    content
+    """
+    with open(html_path, "r") as fhandle:
+        contents = fhandle.read()
+        lines = contents.splitlines()
+
+    new_lines = []
+    for line in lines:
+        if 'pages-articles-multistream.xml' in line:
+            line = line.replace(
+                "<li class='missing'>",
+                "<li class='file'>" + '<a 
href="{path}">'.format(path=multistream_name))
+            line = line.replace(
+                "stream.xml.bz2</li>",
+                "stream.xml.bz2</a> {size} 
</li>".format(size=get_printable_size(multistream_path)))
+        elif 'pages-articles-multistream-index.txt' in line:
+            line = line.replace(
+                "<li class='missing'>",
+                "<li class='file'>" + '<a 
href="{path}">'.format(path=multistr_index_name))
+            line = line.replace(
+                "index.txt.bz2</li>",
+                "index.txt.bz2</a> {size} </li>".format(
+                    size=get_printable_size(multistr_index_path)))
+        new_lines.append(line)
+    return new_lines
+
+
+def cleanup_html(wiki, dumpstree, date, dryrun):
+    """
+    add size and link for content and index multistream files
+    to index.html file for the dump of the given wiki and date,
+    writing out a new file.
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist,
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        return
+    multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2'])
+    multistr_index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
+
+    multistream_path = os.path.join(dumpsdir, multistream_name)
+    multistr_index_path = os.path.join(dumpsdir, multistr_index_name)
+
+    html_path = os.path.join(dumpsdir, 'index.html')
+    lines = get_new_html(multistream_name, multistr_index_name,
+                         multistream_path, multistr_index_path,
+                         html_path)
+
+    new_file = html_path + '.new'
+    if dryrun:
+        print "would write lines to {out}:".format(out=new_file)
+        for line in lines:
+            if 'pages-articles-multistream' in line:
+                print line
+    else:
+        output = '\n'.join(lines) + '\n'
+        output_handle = file(new_file, "wt")
+        output_handle.write(output)
+        output_handle.close()
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Add link and size of multistream content and index files to index.html
+for all wikis for the given date.
+Writes new html files into a temporary location 'index.html.new'.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+def do_main(alldbs, dumpstree, date, dryrun):
+    "entry point"
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_html(wiki, dumpstree, date, dryrun)
+
+
+if __name__ == '__main__':
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    do_main(dblist,
+            publicdir,
+            date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False)
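
The recursive pretty_size() above divides the byte count by 1024 until it fits
the next unit in the quanta tuple; a couple of illustrative calls (values are
made up, not taken from a real dump run):

    quanta = ("%d bytes", "%d KB", "%0.1f MB", "%0.1f GB", "%0.1f TB")
    print(pretty_size(512, quanta))        # "512 bytes"
    print(pretty_size(3500000, quanta))    # 3500000 / 1024 / 1024 -> "3.3 MB"
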
diff --git a/fixups/fixup_recompress_moves.py b/fixups/fixup_recompress_moves.py
new file mode 100644
index 0000000..eae4dba
--- /dev/null
+++ b/fixups/fixup_recompress_moves.py
@@ -0,0 +1,126 @@
+import os
+import sys
+from subprocess import Popen
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file and return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def compress(input_path, output_path, dryrun):
+    """
+    returns True on success, False on failure
+    """
+    command = "/bin/bzip2 -zc {inp} > {out}".format(
+        inp=input_path, out=output_path)
+    if dryrun:
+        print "would run", command
+        return True
+    try:
+        proc = Popen(command, shell=True)
+        _output, error = proc.communicate()
+    except Exception:
+        # fixme display the issue too
+        return False
+
+    if error is not None:
+        print error
+        return False
+    else:
+        return True
+
+
+def is_compressed(path):
+    """
+    check if the file is bz2 compressed
+    return True if so, False otherwise
+    """
+    with open(path) as fhandle:
+        header = fhandle.read(7)
+        return bool(header.startswith("BZh91AY"))
+
+
+def cleanup_multistreams(wiki, dumpstree, date, dryrun):
+    """
+    for the specified wiki, if there is a multistream
+    content file with temp filename, move it into the
+    permanent location; if there is a multistream index
+    file with temp filename, bzip2 compress it into the
+    permanent location
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist,
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        return
+    multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2'])
+    index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
+    extension = '.inprog'
+    multistream_path = os.path.join(dumpsdir, multistream_name)
+    index_path = os.path.join(dumpsdir, index_name)
+    if os.path.exists(multistream_path + extension):
+        if dryrun:
+            print "would rename", multistream_path + extension, "to", 
multistream_path
+        else:
+            os.rename(multistream_path + extension, multistream_path)
+    if os.path.exists(index_path + extension):
+        if os.path.exists(index_path):
+            print "target file ", index_path, "already exists, skipping"
+        else:
+            if is_compressed(index_path + extension):
+                # don't compress, just move into place
+                if dryrun:
+                    print "would rename", index_path + extension, "to", 
index_path
+                else:
+                    os.rename(index_path + extension, index_path)
+            elif compress(index_path + extension, index_path, dryrun):
+                if dryrun:
+                    print "would remove", index_path + extension
+                else:
+                    os.unlink(index_path + extension)
+
+
+def do_main(alldbs, dumpstree, date, dryrun):
+    """
+    entry point. for all wikis in the list, for the dump date specified
+    by date (YYYYMMDD), fix up the articles multistream content and
+    index file in the subdir wiki/date under the specified dumpstree.
+    """
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_multistreams(wiki, dumpstree, date, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Moves multistream content file from temp to permanent location;
+Bzip2 compresses index file into permanent location and removes
+temp file.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+    do_main(dblist,
+            publicdir,
+            date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False)
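
The is_compressed() check above matches the start of a bzip2 stream: "BZh",
the block-size digit, and the first bytes of the block magic; "BZh91AY" thus
assumes the default block size of 9. A slightly more permissive variant that
accepts any block size might look like this (illustrative sketch only):

    def looks_like_bz2(path):
        # "BZh" magic followed by a block-size digit 1-9
        with open(path, 'rb') as fhandle:
            header = fhandle.read(4)
        return len(header) == 4 and header[:3] == b'BZh' and header[3:4] in b'123456789'
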
diff --git a/fixups/fixup_report_json.py b/fixups/fixup_report_json.py
new file mode 100644
index 0000000..9c0608a
--- /dev/null
+++ b/fixups/fixup_report_json.py
@@ -0,0 +1,86 @@
+import os
+import sys
+import json
+
+
+def read_wikis(filepath):
+    "read list of wkis from file, one per line, and return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def cleanup_report_json(wiki, dumpstree, date, dryrun):
+    """add size and relative url for multistream content and index files
+    to contents of report.json, and write out a new file."""
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist,
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        print "skipping this wiki:", wiki
+        return
+    multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2'])
+    index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
+
+    multistream_path = os.path.join(dumpsdir, multistream_name)
+    index_path = os.path.join(dumpsdir, index_name)
+
+    report_json_path = os.path.join(dumpstree, wiki, date, 'report.json')
+    with open(report_json_path, "r") as fhandle:
+        contents = fhandle.read()
+        output = json.loads(contents)
+
+    if os.path.exists(multistream_path):
+        output['jobs']['articlesmultistreamdump']['files'][multistream_name] = {
+            'size': os.path.getsize(multistream_path),
+            'url': os.path.join('/', wiki, date, multistream_name)}
+    if os.path.exists(index_path):
+        output['jobs']['articlesmultistreamdump']['files'][index_name] = {
+            'size': os.path.getsize(index_path),
+            'url': os.path.join('/', wiki, date, index_name)}
+
+    new_file = report_json_path + '.new'
+    if dryrun:
+        print "would write '{inp}' to".format(inp=json.dumps(output)), new_file
+    else:
+        output_handle = file(new_file, "w")
+        output_handle.write(json.dumps(output))
+        output_handle.close()
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Adds information about the multistream content file and the
+index file to report.json, writing a new temp file.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+def do_main(alldbs, dumpstree, date, dryrun):
+    "main entry point"
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_report_json(wiki, dumpstree, date, dryrun)
+
+
+if __name__ == '__main__':
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    do_main(dblist, publicdir, date=sys.argv[1],
+            dryrun=True if len(sys.argv) == 3 else False)
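
For orientation, the entry this script adds to report.json has the nested
shape built above; a small self-contained sketch (the wiki, date, and size
are made-up example values):

    import json
    import os

    wiki, date = 'enwiki', '20171020'
    index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
    report = {'jobs': {'articlesmultistreamdump': {'files': {}}}}
    report['jobs']['articlesmultistreamdump']['files'][index_name] = {
        'size': 230000000,
        'url': os.path.join('/', wiki, date, index_name)}
    print(json.dumps(report))
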
diff --git a/xmldumps-backup/see_master_branch/listmediaperproject.py b/xmldumps-backup/see_master_branch/listmediaperproject.py
index c13fa65..20d303f 100644
--- a/xmldumps-backup/see_master_branch/listmediaperproject.py
+++ b/xmldumps-backup/see_master_branch/listmediaperproject.py
@@ -6,6 +6,9 @@
 from dumps.WikiDump import Config
 
 
+# pylint: disable=W0703
+
+
 def get_file_name_format(phase):
     return "{w}-{d}-" + phase + "-wikiqueries.gz"
 
@@ -68,7 +71,7 @@
                 print ("command '%s failed with return code %s and error %s"
                        % (command, proc.returncode, error))
                 sys.exit(1)
-        except:
+        except Exception:
             print "command %s failed" % command
             raise
 
@@ -138,7 +141,7 @@
             "outputdir=", "remotereponame=", "wqconfig=", "wqpath=",
             "remoteonly", "localonly",
             "nooverwrite", "verbose"])
-    except:
+    except Exception:
         usage("Unknown option specified")
 
     for (opt, val) in options:
diff --git a/xmldumps-backup/see_master_branch/wikiqueries.py b/xmldumps-backup/see_master_branch/wikiqueries.py
index 1473f52..2df3d47 100644
--- a/xmldumps-backup/see_master_branch/wikiqueries.py
+++ b/xmldumps-backup/see_master_branch/wikiqueries.py
@@ -14,6 +14,9 @@
 from dumps.fileutils import FileUtils
 
 
+# pylint: disable=W0703
+
+
 class WQDbServerInfo(DbServerInfo):
     def build_sql_command_tofile(self, query, out_file):
         """Put together a command to execute an sql query
@@ -61,7 +64,7 @@
                 if not self.dryrun:
                     if not self.run_wiki_query():
                         return False
-            except:
+            except Exception:
                 if self.verbose:
                     traceback.print_exc(file=sys.stdout)
                 return False
@@ -173,7 +176,7 @@
             sys.argv[1:], "", ['configfile=', "date=", 'filenameformat=',
                                "outdir=", "query=", "retries=", 'dryrun',
                                "nooverwrite", 'verbose'])
-    except:
+    except Exception:
         usage("Unknown option specified")
 
     for (opt, val) in options:
diff --git a/xmldumps-backup/tools/fixaborts.py b/xmldumps-backup/tools/fixaborts.py
index a89ebf1..2962d81 100644
--- a/xmldumps-backup/tools/fixaborts.py
+++ b/xmldumps-backup/tools/fixaborts.py
@@ -11,6 +11,9 @@
 import getopt
 
 
+# pylint: disable=W0703
+
+
 def usage(message=None):
     '''
     display a helpful usage message
@@ -105,7 +108,7 @@
         # fixme does this close automagically later?
         contents = open(path, "r").read()
         return contents
-    except:
+    except Exception:
         return None
 
 
@@ -229,7 +232,7 @@
     '''
     try:
         dates = os.listdir(os.path.join(dumpdir, wiki))
-    except:
+    except Exception:
         return None
     if not dates:
         return None
diff --git a/xmldumps-backup/tools/mysql2txt.py b/xmldumps-backup/tools/mysql2txt.py
index f491b06..50ac5a3 100644
--- a/xmldumps-backup/tools/mysql2txt.py
+++ b/xmldumps-backup/tools/mysql2txt.py
@@ -7,6 +7,9 @@
 import sys
 
 
+# pylint: disable=W0703
+
+
 class ConverterError(Exception):
     pass
 
@@ -365,7 +368,7 @@
     try:
         (options, remainder) = getopt.gnu_getopt(
             sys.argv[1:], "", ['table=', 'columns=', 'values=', 'separator='])
-    except:
+    except Exception:
         usage("Unknown option specified")
 
     for (opt, val) in options:

-- 
To view, visit https://gerrit.wikimedia.org/r/386162
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>
