ArielGlenn has submitted this change and it was merged.

Change subject: refactor generation of per project media lists
......................................................................


refactor generation of per project media lists

* use globalimagelinks table for remote media lists
* fix running on one wiki to actually only run on that one
* toss all the redirect and dict crap, unneeded now
* rename functions to what they do (writeLocalMedia,
  writeRemoteMedia), clean up option names, usage

Change-Id: Ifa8c877de7b64a0cb84b0a0f326836e78feca8dd
---
M xmldumps-backup/wikiqueries/wmfgetremoteimages.py
1 file changed, 62 insertions(+), 215 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved



diff --git a/xmldumps-backup/wikiqueries/wmfgetremoteimages.py b/xmldumps-backup/wikiqueries/wmfgetremoteimages.py
index c491a89..7524323 100644
--- a/xmldumps-backup/wikiqueries/wmfgetremoteimages.py
+++ b/xmldumps-backup/wikiqueries/wmfgetremoteimages.py
@@ -1,24 +1,17 @@
 import os
 import sys
-import re
-import gzip
 import getopt
 import time
-import ConfigParser
-import subprocess
-import codecs
 from subprocess import Popen, PIPE
 from wikiqueries import Config
 
 
 class MediaPerProject(object):
-    def __init__(self, conf, outputDir, remoteRepoName, reuseRepoDate,
-                 verbose, wqConfigFile, wqPath, overwrite):
+    def __init__(self, conf, outputDir, remoteRepoName,
+                 verbose, wqConfigFile, wqPath, overwrite, wiki=None):
         self.conf = conf
         self.outputDir = outputDir
-        self.reuseRepoDate = reuseRepoDate
         self.remoteRepoName = remoteRepoName
-        self.remoteRepoMediaDict = None
         self.verbose = verbose
         self.date = time.strftime("%Y%m%d", time.gmtime())
         self.fileNameFormat = "{w}-{d}-wikiqueries.gz"
@@ -27,35 +20,27 @@
         self.overwrite = overwrite
         if not os.path.exists(outputDir):
             os.makedirs(outputDir)
-        self.wikisToDo = [w for w in self.conf.allWikisList
-                          if w not in self.conf.privateWikisList and
-                          w not in self.conf.closedWikisList]
+        if wiki is not None:
+            self.wikisToDo = [wiki]
+        else:
+            self.wikisToDo = [w for w in self.conf.allWikisList
+                              if w not in self.conf.privateWikisList and
+                              w not in self.conf.closedWikisList]
 
     def getFileNameFormat(self, phase):
         return "{w}-{d}-" + phase + "-wikiqueries.gz"
 
-    def generateSqlFiles(self):
+    def writeLocalMedia(self):
         if self.verbose:
-            print "Starting round one wikiqueries for imagelinks table"
-        self.doWikiQueries('select distinct il_to from imagelinks',
-                           self.getFileNameFormat("links"))
+            print "Starting round one wikiqueries for image table"
+        if len(self.wikisToDo) == 1:
+            wiki = self.wikisToDo[0]
+        else:
+            wiki = None
+        self.doWikiQueries('select img_name, img_timestamp from image',
+                           self.getFileNameFormat("local"), wiki)
         if self.verbose:
             print "Done round one!!"
-            print "Starting round two wikiqueries for image table"
-        self.doWikiQueries('select img_name, img_timestamp from image',
-                           self.getFileNameFormat("local"))
-        if self.verbose:
-            print "Done round two!!"
-        if self.verbose:
-            print "Starting round three wikiquries for remote redirs"
-        self.doWikiQueries("select p.page_title, r.rd_title from"
-                           " redirect as r, page as p where"
-                           " rd_namespace = 6 and p.page_id = r.rd_from"
-                           " and page_namespace = 6",
-                           self.getFileNameFormat("redirs"),
-                           self.remoteRepoName)
-        if self.verbose:
-            print "Done round three!!"
 
     def doWikiQueries(self, query, fileNameFormat, wiki=None):
         if not os.path.exists(wqConfigFile):
@@ -76,169 +61,32 @@
             print "About to run wikiqueries:", commandString
         try:
             proc = Popen(command, stderr=PIPE)
-            output, error = proc.communicate()  # no output, ignore it
+            output_unused, error = proc.communicate()
             if proc.returncode:
                 print ("command '%s failed with return code %s and error %s"
-                       % (command, proc.returncode,  error))
+                       % (command, proc.returncode, error))
                 sys.exit(1)
         except:
             print "command %s failed" % command
             raise
 
-    def writeRemoteHostedMediaList(self, db, remoteRepoMediaDict):
-        if db == self.remoteRepoName:
-            if self.verbose:
-                print "Skipping", db, "because it's the remote repo"
-            return
+    def writeRemoteMedia(self):
         if self.verbose:
-            print "Doing db", db
+            print "Starting round two wikiqueries for global image links table"
 
-        # get all media used on a project and remotely stored (or
-        # they just don't exist; links to nonexistent media still
-        # go into the links table so we'll get those too)
-
-        if not os.path.exists(self.getPath(self.getMediaLinksFileName(db))):
-            if self.verbose:
-                print "Skipping", db,
-                print "since sql files for it were not generated"
-            return
-
-        # links may have initial lowercase; media titles are
-        # all initial uppercase.
-
-        # can't use codecs.getreader()gzip.open()) because it will
-        # find '\n' in multibyte chars
-        mediaLinksFd = gzip.open(self.getPath(
-            self.getMediaLinksFileName(db)), "rb")
-        mediaLinks = filter(None, [self.safeDecode(line)
-                                   for line in mediaLinksFd])
-        mediaLinks = [line[0].upper() + line.strip()[1:]
-                      for line in mediaLinks]
-        mediaLinksFd.close()
-
-        localMediaFd = gzip.open(self.getPath(
-            self.getLocalMediaFileName(db)), "rb")
-        localMediaDict = self.getMediaDict(localMediaFd)
-        localMediaFd.close()
-
-        remoteHostedMedia = [m for m in mediaLinks if m not in localMediaDict]
-
-        # replace all the remoteHosted entries that we find in the remote redir
-        # list with the titles of the redirect targets.
-        remoteMediaRedirsFd = gzip.open(self.getPath(
-            self.getremoteMediaRedirsFileName(self.date)), "rb")
-        remoteMediaRedirsDict = self.getMediaRedirsDict(remoteMediaRedirsFd)
-        remoteMediaRedirsFd.close()
-
-        remoteHostedMediaNoRedirs = set([remoteMediaRedirsDict[f]
-                                         if f in remoteMediaRedirsDict
-                                         else f for f in remoteHostedMedia])
-        remoteHostedMediaExists = [m for m in remoteHostedMediaNoRedirs
-                                   if m in remoteRepoMediaDict]
-
-        outFd = codecs.getwriter("utf-8")(gzip.open(
-            self.getPath(self.getRemoteMediaFileName(db)), "wb"))
-        for f in remoteHostedMediaExists:
-            outFd.write("%s\t%s\n" % (f, self.remoteRepoMediaDict[f]))
-        outFd.close()
-        if self.verbose:
-            print "Done!"
-
-    def safeDecode(self, line):
-        try:
-            line = line.strip().decode("utf-8")
-        except UnicodeDecodeError:
-            print "unicode decode failed, line is", line
-            line = None
-        return line
-
-    def getMediaRedirsDict(self, RedirsFd):
-        redirsDict = {}
-        for line in RedirsFd:
-            line = self.safeDecode(line)
-            fromTitle, toTitle = self.getSqlFields(line, 2)
-            if fromTitle and toTitle:
-                redirsDict[fromTitle] = toTitle
-        return redirsDict
-
-    # sure this is functionally identical to getmediaredirsdict
-    # but we may want more fields in here later so it's split out
-    def getMediaDict(self, mediaFd):
-        mediaDict = {}
-        for line in mediaFd:
-            line = self.safeDecode(line)
-            title, timestamp = self.getSqlFields(line, 2)
-            if title and timestamp:
-                mediaDict[title] = timestamp
-        return mediaDict
-
-    def getSqlFields(self, line, numFields):
-        if not line or not '\t' in line:
-            return None, None
-        return line.split('\t', 1)
-
-    def initializeRemoteRepoMediaDict(self):
-        if self.verbose:
-            print "setting up list of media from remote repo"
-        if not self.remoteRepoMediaDict:
-            if reuseRepoDate:
+        for w in self.wikisToDo:
+            if w == self.remoteRepoName:
                 if self.verbose:
-                    print "attempting to reuse previously",
-                    print "generated remote repo media list"
-                try:
-                    self.readRemoteRepoMediaDict(self.reuseRepoDate)
-                except:
-                    pass
-        if not self.remoteRepoMediaDict:
-            if self.verbose:
-                print "reading current list of remote repo media"
-            try:
-                self.readRemoteRepoMediaDict()
-            except:
-                print "failed to read remote repo media list"
-                sys.exit(1)
-
-    def readRemoteRepoMediaDict(self, date=None):
-        remoteRepoMediaDictFd = gzip.open(self.getPath(
-            self.getRemoteRepoFileName(date)), "rb")
-        self.remoteRepoMediaDict = self.getMediaDict(remoteRepoMediaDictFd)
-        remoteRepoMediaDictFd.close()
-
-    def getPath(self, fileName):
-        return(os.path.join(self.outputDir, fileName))
-
-    def getFileName(self, phase, wiki, date):
-        if not date:
-            date = self.date
-        return (self.getFileNameFormat(phase).format(w=wiki, d=date))
-
-    def getremoteMediaRedirsFileName(self, date=None):
-        return self.getFileName("redirs", self.remoteRepoName, date)
-
-    def getRemoteRepoFileName(self, date=None):
-        return self.getFileName("local", self.remoteRepoName, date)
-
-    def getMediaLinksFileName(self, wiki, date=None):
-        return self.getFileName("links", wiki, date)
-
-    def getLocalMediaFileName(self, wiki, date=None):
-        return self.getFileName("local", wiki, date)
-
-    def getRemoteMediaFileName(self, wiki, date=None):
-        return self.getFileName("remote", wiki, date)
-
-    def doAllProjects(self):
-        self.initializeRemoteRepoMediaDict()
-        if wiki:
-            dbList = [wiki]
-        else:
-            dbList = self.wikisToDo
+                    print "Skipping", w, "because it's the remote repo"
+            else:
+                if self.verbose:
+                    print "Doing db", w
+                self.doWikiQueries('select gil_to from globalimagelinks'
+                                   ' where gil_wiki= "%s"' % w,
+                                   self.getFileNameFormat("remote").format(
+                                       w=w, d='{d}'), self.remoteRepoName)
         if self.verbose:
-            print "Starting generation of remote media lists for all wikis"
-        for db in dbList:
-            self.writeRemoteHostedMediaList(db, self.remoteRepoMediaDict)
-        if self.verbose:
-            print "Done with generation of remote media lists for all wikis"
+            print "Done round two!!"
 
 
 def usage(message=None):
@@ -246,7 +94,7 @@
         sys.stderr.write(message + "\n")
 
     usage_message = """Usage: python wmfgetremoteimages.py --outputdir dirname
-                  [--remoterepo reponame] [--reuseremoterepolist]
+                  [--remoterepo reponame] [--localonly] [--remoteonly]
                   [--verbose] [--wqconfig filename] [wqpath filename] [wiki]
 
 This script produces a list of media files in use on the local wiki stored on a
@@ -254,16 +102,16 @@
 
 --outputdir:      where to put the list of remotely hosted media per project
 --remotereponame: name of the remote repo that houses media for projects
-                  default: 'commons'
---nooverwrite:    if run for the same date and wiki(s), dobn't overwrite
+                  default: 'commonswiki'
+--nooverwrite:    if run for the same wiki(s) on the same date, don't overwrite
                   existing files
 --verbose:        print lots of status messages
 --wqconfig:       relative or absolute path of wikiquery config file
                   default: wikiqueries.conf
 --wqpath:         relative or absolute path of the wikiqieries python script
                   default: wikiqueries.py
---sqlonly:        only do the sql queries (first half of run)
---listsonly:      only generate the lists from the sql files (second half
+--localonly:      only generate the lists of local media (first half of run)
+--remoteonly:     only generate the lists of remotely hosted media (second half
                   of run)
 """
     sys.stderr.write(usage_message)
@@ -272,12 +120,11 @@
 
 if __name__ == "__main__":
     outputDir = None
-    remoteRepoName = "commons"
-    reuseRepoDate = None
+    remoteRepoName = "commonswiki"
     verbose = False
     wiki = None
-    sqlOnly = False
-    listsOnly = False
+    remoteOnly = False
+    localOnly = False
     # by default we will overwrite existing files for
     # the same date and wiki(s)
     overwrite = True
@@ -287,7 +134,7 @@
     try:
         (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", [
             "outputdir=", "remotereponame=", "wqconfig=", "wqpath=",
-            "reuseremoterepolist=", "sqlonly", "listsonly",
+            "remoteonly", "localonly",
             "nooverwrite", "verbose"])
     except:
         usage("Unknown option specified")
@@ -297,12 +144,10 @@
             outputDir = val
         elif opt == "--remotereponame":
             remoteRepoName = val
-        elif opt == "--reuseremoterepolist":
-            reuseRepoDate = val
-        elif opt == "--sqlonly":
-            sqlOnly = True
-        elif opt == "--listsonly":
-            listsOnly = True
+        elif opt == "--remoteonly":
+            remoteOnly = True
+        elif opt == "--localonly":
+            localOnly = True
         elif opt == "--nooverwrite":
             overwrite = False
         elif opt == "--verbose":
@@ -312,34 +157,36 @@
             if not os.sep in val:
                 wqConfigFile = os.path.join(os.getcwd(), wqConfigFile)
             # bummer but we can't really avoid ita
-            config = Config(wqConfigFile)
         elif opt == "--wqpath":
             wqPath = val
             if not os.sep in val:
                 wqPath = os.path.join(os.getcwd(), wqPath)
 
-    if len(remainder) > 0:
-        if not remainder.isalpha():
-            usage("Unknown option specified")
+    if len(remainder) == 1:
+        if not remainder[0].isalpha():
+            usage("Unknown argument(s) specified")
         else:
-            wiki = remainder
+            wiki = remainder[0]
+    elif len(remainder) > 1:
+        usage("Unknown argument(s) specified")
 
     if not outputDir:
         usage("One or more mandatory options missing")
-    if listsOnly and sqlOnly:
-        usage("Only one of 'listsonly' and 'sqlonly'"
+    if localOnly and remoteOnly:
+        usage("Only one of 'localonly' and 'remoteonly'"
               " may be specified at once.")
 
-    mpp = MediaPerProject(config, outputDir, remoteRepoName, reuseRepoDate,
-                          verbose, wqConfigFile, wqPath, overwrite)
-    if not listsOnly:
+    config = Config(wqConfigFile)
+
+    mpp = MediaPerProject(config, outputDir, remoteRepoName,
+                          verbose, wqConfigFile, wqPath, overwrite, wiki)
+    if not remoteOnly:
         if verbose:
-            print "generating sql output from all projects"
-        mpp.generateSqlFiles()
-    # we'll need the list of existing remote repo media to compare against
-    if not sqlOnly:
+            print "generating lists of local media on each project"
+        mpp.writeLocalMedia()
+    if not localOnly:
         if verbose:
-            print "generating remote hosted media list for all projects"
-        mpp.doAllProjects()
+            print "generating remote hosted media lists for all projects"
+        mpp.writeRemoteMedia()
     if verbose:
         print "all projects completed."

-- 
To view, visit https://gerrit.wikimedia.org/r/120766
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifa8c877de7b64a0cb84b0a0f326836e78feca8dd
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to