http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90557
Revision: 90557
Author:   halfak
Date:     2011-06-21 22:47:39 +0000 (Tue, 21 Jun 2011)

Log Message:
-----------
added wikimedia utilities

Modified Paths:
--------------
    trunk/tools/wsor/ts_samples/testing.sql

Added Paths:
-----------
    trunk/tools/wsor/wikimedia/
    trunk/tools/wsor/wikimedia/setup.py
    trunk/tools/wsor/wikimedia/wmf/
    trunk/tools/wsor/wikimedia/wmf/__init__.py
    trunk/tools/wsor/wikimedia/wmf/dump/
    trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
    trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
    trunk/tools/wsor/wikimedia/wmf/dump/map.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/
    trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
    trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
    trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
    trunk/tools/wsor/wikimedia/wmf/util.py

Removed Paths:
-------------
    trunk/tools/wsor/scripts/process_dumps.py

Deleted: trunk/tools/wsor/scripts/process_dumps.py
===================================================================
--- trunk/tools/wsor/scripts/process_dumps.py	2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/scripts/process_dumps.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -1,186 +0,0 @@
-import sys, logging, re, types, argparse, os, subprocess
-from multiprocessing import Process, Queue, Lock, cpu_count, Value
-from Queue import Empty
-from gl import wp
-
-class FileTypeError(Exception):pass
-
-def encode(v):
-    if type(v) == types.FloatType:
-        return str(int(v))
-    elif v == None:
-        return "\\N"
-    else:
-        return repr(v)
-
-
-class SafeOutput:
-
-    def __init__(self, fp):
-        self.fp = fp
-        self.l = Lock()
-
-    def push(self, row, encode=encode):
-        if __debug__:
-            row = tuple(row)
-
-        with self.l:
-            self.fp.write("\t".join(clean(v) for v in row) + "\n")
-
-class Processor(Process):
-
-    def __init__(self, input, processPage, output, callback, logger):
-        self.input = input
-        self.processPage = processPage
-        self.output = output
-        self.callback = callback
-        self.logger = logger
-        Process.__init__(self)
-
-    def run(self):
-        try:
-            while True:
-                foo = self.input.qsize()
-                fn = self.input.get(block=False)
-                self.logger.info("Processing dump file %s." % fn)
-                dump = wp.dump.Iterator(openDumpFile(fn))
-                for page in dump.readPages():
-                    self.logger.debug("Processing page %s:%s." % (page.getId(), page.getTitle()))
-                    try:
-                        for out in self.processPage(dump, page):
-                            self.output.put(out)
-                    except Exception as e:
-                        self.logger.error(
-                            "Failed to process page %s:%s - %s" % (
-                                page.getId(),
-                                page.getTitle(),
-                                e
-                            )
-                        )
-
-        except Empty:
-            self.logger.info("Nothing left to do. Shutting down thread.")
-        finally:
-            self.callback()
-
-
-def main(args):
-    LOGGING_STREAM = sys.stderr
-    if __debug__: level = logging.DEBUG
-    else:         level = logging.INFO
-    logging.basicConfig(
-        level=level,
-        stream=LOGGING_STREAM,
-        format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
-        datefmt='%b-%d %H:%M:%S'
-    )
-    logging.info("Starting dump processor with %s threads." % min(args.threads, len(args.dump)))
-    for row in process_dumps(args.dump, args.processor.process, args.threads):
-        print('\t'.join(encode(v) for v in row))
-
-def process_dumps(dumps, processPage, threads):
-    input = dumpFiles(dumps)
-    output = Queue(maxsize=10000)
-    running = Value('i', 0)
-
-    def dec(): running.value -= 1
-
-    for i in range(0, min(threads, input.qsize())):
-        running.value += 1
-        Processor(
-            input,
-            processPage,
-            output,
-            dec,
-            logging.getLogger("Process %s" % i)
-        ).start()
-
-    #output while processes are running
-    while running.value > 0:
-        try: yield output.get(timeout=.25)
-        except Empty: pass
-
-    #finish yielding output buffer
-    try:
-        while True: yield output.get(block=False)
-    except Empty:
-        pass
-
-
-EXTENSIONS = {
-    'xml': "cat",
-    'bz2': "bzcat",
-    '7z': "7z e -so 2>/dev/null",
-    'lzma':"lzcat"
-}
-
-EXT_RE = re.compile(r'\.([^\.]+)$')
-def dumpFile(path):
-    path = os.path.expanduser(path)
-    if not os.path.isfile(path):
-        raise FileTypeError("Can't find file %s" % path)
-
-    match = EXT_RE.search(path)
-    if match == None:
-        raise FileTypeError("No extension found for %s." % path)
-    elif match.groups()[0] not in EXTENSIONS:
-        raise FileTypeError("File type %r is not supported." % path)
-    else:
-        return path
-
-def dumpFiles(paths):
-    q = Queue()
-    for path in paths: q.put(dumpFile(path))
-    return q
-
-def openDumpFile(path):
-    match = EXT_RE.search(path)
-    ext = match.groups()[0]
-    p = subprocess.Popen(
-        "%s %s" % (EXTENSIONS[ext], path),
-        shell=True,
-        stdout=subprocess.PIPE
-    )
-    return p.stdout
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description='Maps a function across pages of MediaWiki dump files'
-    )
-    parser.add_argument(
-        '-o', '--out',
-        metavar="<path>",
-        type=lambda path:open(path, "w"),
-        help='the path to an output file to write putput to (defaults to stdout)',
-        default=sys.stdout
-    )
-    parser.add_argument(
-        '-t', '--threads',
-        metavar="",
-        type=int,
-        help='the number of threads to start (defaults to # of cores -1)',
-        default=cpu_count()-1
-    )
-    parser.add_argument(
-        'processor',
-        type=__import__,
-        help='the class path to the function to use to process each page'
-    )
-    parser.add_argument(
-        'dump',
-        type=dumpFile,
-        help='the XML dump file(s) to process',
-        nargs="+"
-    )
-    args = parser.parse_args()
-    main(args)

Modified: trunk/tools/wsor/ts_samples/testing.sql
===================================================================
--- trunk/tools/wsor/ts_samples/testing.sql	2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/ts_samples/testing.sql	2011-06-21 22:47:39 UTC (rev 90557)
@@ -40,3 +40,30 @@
 CREATE UNIQUE INDEX user_id_idx ON halfak.user_meta (user_id);
 CREATE INDEX first_edit_idx ON halfak.user_meta (first_edit);
 CREATE INDEX last_edit_idx ON halfak.user_meta (last_edit);
+
+
+SELECT
+    year,
+    biannual,
+    count(*)
+FROM
+(
+SELECT
+    u.user_id,
+    SUBSTRING(first_edit, 1, 4) as year,
+    SUBSTRING(first_edit, 5, 2) >= "07" as biannual
+FROM halfak.user_meta um
+INNER JOIN user u
+    ON u.user_id = um.user_id
+INNER JOIN page p
+    ON p.page_title = u.user_name
+    AND p.page_namespace = 3
+INNER JOIN revision r
+    ON um.user_id != r.rev_user
+    AND p.page_id = r.rev_page
+GROUP BY
+    user_id,
+    SUBSTRING(first_edit, 1, 4),
+    SUBSTRING(first_edit, 5, 2)
+) as foo
+GROUP BY year, biannual;
Added: trunk/tools/wsor/wikimedia/setup.py
===================================================================
--- trunk/tools/wsor/wikimedia/setup.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/setup.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,27 @@
+
+from setuptools import setup, find_packages
+
+setup(
+    name='util',
+    version='1.0',
+    description="WMF utilities",
+    long_description="""
+    A set of utilities originally authored by Aaron Halfaker
+    during the 2011 Wikimedia Summer of Research.  The utilities
+    in this package are intended to aid in the processing of
+    MediaWiki data related to Wikimedia projects.  Many of the
+    utilities have been specifically designed to allow
+    processing of the massive amount of data (currently) found
+    in the full history dump of the English Wikipedia.
+    """,
+    author='Aaron Halfaker',
+    author_email='aaron.halfa...@gmail.com',
+    url='http://meta.wikimedia.org/wiki/User:EpochFail',
+    packages=find_packages(),
+    entry_points={
+        'distutils.commands': [
+            'dump_map = wmf.dump.map:main',
+        ]
+    },
+
+)

Added: trunk/tools/wsor/wikimedia/wmf/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/__init__.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/__init__.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from __future__ import absolute_import
+from .util import *

Added: trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/__init__.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/__init__.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from .iterator import Iterator
+from .map import map
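A note for readers trying the package out: assuming a standard setuptools install from the directory above (e.g. `python setup.py install`), the importable package is `wmf` — the name wired up by the two __init__.py files just shown, since find_packages() discovers the wmf/ directory. A quick sanity-check sketch (nothing here beyond what the __init__ files export):

    from wmf import dump           # dump/__init__.py exposes Iterator and map
    from wmf import wp2Timestamp   # re-exported by `from .util import *`

    print(dump.Iterator)
    print(dump.map)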
Added: trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/iterator.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/iterator.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,220 @@
+from .xml_iterator import XMLIterator
+from ..util import wp2Timestamp
+
+def cleanTag(prefix, raw):
+    return raw[len(prefix):]
+
+
+class Iterator:
+    """
+    WikiFile dump processor.  This class is constructed with a file pointer
+    to a Wikipedia XML dump file.
+    """
+
+    def __init__(self, fp):
+        """
+        Constructor
+
+        :Parameters:
+            fp : file pointer
+                a file pointer to the xml file to process.
+        """
+
+        self.fp = fp              #:The file pointer passed to the constructor
+        self.namespaces = {}      #:A map of possible namespaces
+        self.siteName = None      #:The name of the site
+        self.base = None          #:Base of the xml file
+        self.generator = None     #:Generator of the dump
+        self.case = None          #:The default title case
+
+        self.mediawikiElement = XMLIterator(fp)
+        self.ns = self.mediawikiElement.tag[:-len('mediawiki')]
+
+        for element in self.mediawikiElement:
+            tag = cleanTag(self.ns, element.tag)
+            if tag == "siteinfo":
+                self.loadSiteInfo(element)
+                element.clear()
+                break
+
+    def loadSiteInfo(self, siteInfoElement):
+        for element in siteInfoElement:
+            tag = cleanTag(self.ns, element.tag)
+
+            if tag == 'sitename':
+                self.siteName = element.text
+            elif tag == 'base':
+                self.base = element.text
+            elif tag == 'generator':
+                self.generator = element.text
+            elif tag == 'case':
+                self.case = element.text
+            elif tag == 'namespaces':
+                self.loadNamespaces(element)
+                element.clear()
+
+    def loadNamespaces(self, namespacesElement):
+        for element in namespacesElement:
+            tag = cleanTag(self.ns, element.tag)
+
+            if tag == "namespace":
+                namespace = Namespace(element)
+                self.namespaces[namespace.getName()] = namespace.getId()
+            else:
+                assert False, "This should never happen"
+
+    def readPages(self):
+        for element in self.mediawikiElement:
+            tag = cleanTag(self.ns, element.tag)
+            if tag == "page":
+                yield Page(self.ns, element)
+
+
+class Namespace:
+
+    def __init__(self, nsElement):
+        self.setId(nsElement.get('key'))
+        self.setName(nsElement.text)
+
+    def setId(self, id): self.id = int(id)
+    def getId(self): return self.id
+
+    def setName(self, name):
+        if name == None:
+            self.name = None
+        else:
+            self.name = unicode(name)
+    def getName(self): return self.name
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (
+            self.__class__.__name__,
+            self.getId(),
+            self.getName()
+        )
+
+    def __eq__(self, other):
+        try:
+            return (
+                self.getId() == other.getId() and
+                self.getName() == other.getName()
+            )
+        except AttributeError:
+            return False
+
+class Page:
+
+    def __init__(self, ns, pageElement):
+        self.id = None
+        self.title = None
+        self.pageElement = pageElement
+        self.ns = ns
+        for element in pageElement:
+            tag = cleanTag(ns, element.tag)
+            if tag == "id":
+                self.setId(element.text)
+            elif tag == "title":
+                self.setTitle(element.text)
+
+            if self.id != None and self.title != None:
+                break
+
+    def readRevisions(self):
+        for element in self.pageElement:
+            tag = cleanTag(self.ns, element.tag)
+            if tag == "revision":
+                yield Revision(self.ns, element)
+                #element.clear()
+
+    def setId(self, id): self.id = int(id)
+    def getId(self): return self.id
+
+    def setTitle(self, title): self.title = unicode(title)
+    def getTitle(self): return self.title
+
+
+class Revision:
+
+    TAG_MAP = {
+        'id':          lambda s, e: s.setId(e.text),
+        'timestamp':   lambda s, e: s.setTimestamp(e.text),
+        'contributor': lambda s, e: s.setContributor(e),
+        'minor':       lambda s, e: s.setMinor(True),
+        'comment':     lambda s, e: s.setComment(e.text),
+        'text':        lambda s, e: s.setText(e.text)
+    }
+
+    def __init__(self, ns, revisionElement):
+        self.ns = ns
+        self.id = None
+        self.timestamp = None
+        self.contributor = None
+        self.minor = False    #No <minor/> tag means not a minor edit
+        self.comment = None
+        self.text = None
+        for element in revisionElement:
+            tag = cleanTag(ns, element.tag)
+            self.TAG_MAP[tag](self, element)
+
+    def setId(self, id): self.id = int(id)
+    def getId(self): return self.id
+
+    def setTimestamp(self, timestamp):
+        try:                self.timestamp = int(timestamp)
+        except ValueError:  self.timestamp = wp2Timestamp(timestamp)
+    def getTimestamp(self): return self.timestamp
+
+    def setContributor(self, element):
+        if element.get("deleted", None) == "deleted":
+            self.contributor = None
+        else:
+            self.contributor = Contributor(self.ns, element)
+    def getContributor(self): return self.contributor
+
+    def setMinor(self, minor): self.minor = minor == True
+    def getMinor(self): return self.minor
+
+    def setComment(self, comment): self.comment = unicode(comment)
+    def getComment(self): return self.comment
+
+    def setText(self, text):
+        if text == None: self.text = u''
+        else:            self.text = unicode(text)
+    def getText(self): return self.text
+
+class Contributor:
+
+    TAG_MAP = {
+        'id':       lambda s, e: s.setId(e.text),
+        'username': lambda s, e: s.setUsername(e.text),
+        'ip':       lambda s, e: s.setUsername(e.text)  #IPs are stored as the username
+    }
+
+    def __init__(self, ns, contributorElement):
+        self.id = None
+        self.username = None
+        for element in contributorElement:
+            tag = cleanTag(ns, element.tag)
+            self.TAG_MAP[tag](self, element)
+
+    def setId(self, id): self.id = int(id)
+    def getId(self): return self.id
+
+    def setUsername(self, username): self.username = unicode(username)
+    def getUsername(self): return self.username
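A minimal sketch of how the Iterator API above fits together, using only the methods defined in iterator.py (the file name "pages.xml" is hypothetical; any already-decompressed dump stream works):

    from wmf.dump import Iterator

    # Construct with a file pointer to a decompressed XML dump.
    dump = Iterator(open("pages.xml"))
    print(dump.siteName)

    for page in dump.readPages():
        for revision in page.readRevisions():
            # getContributor() returns None when the contributor was deleted.
            contributor = revision.getContributor()
            print("%s\t%s\t%s" % (page.getTitle(), revision.getId(), revision.getTimestamp()))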
Added: trunk/tools/wsor/wikimedia/wmf/dump/map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/map.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/map.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,255 @@
+"""
+Dump Mapper
+
+This script acts as a map function over the pages in a set of MediaWiki
+database dump files.  It allows the algorithm for processing a set of
+pages to be spread across the available processor cores of a system for
+faster analysis.
+
+This script can also be imported as a module to expose the `map()` function,
+which returns an iterator over output rather than printing to stdout.
+
+Example:
+
+python -O process_dumps.py revision_meta /dumps/enwiki-20110115-pages-meta-history* > ~/data/revision_meta.tsv
+"""
+import sys, logging, re, types, argparse, os, subprocess
+from multiprocessing import Process, Queue, Lock, cpu_count, Value
+from Queue import Empty
+
+from .iterator import Iterator
+
+class FileTypeError(Exception): pass
+
+class Processor(Process):
+    """
+    A processor for managing the reading of dump files from a queue and
+    the application of a function to each 'page'.
+    """
+
+    def __init__(self, input, processPage, output, callback, logger):
+        """
+        Constructor
+
+        :Parameters:
+            input : `multiprocessing.Queue`
+                a queue of paths to dump files to process
+            processPage : function
+                a function to apply to each page of a dump file
+            output : `multiprocessing.Queue`
+                a queue to send processing output to
+            callback : function
+                a function to run upon completion
+            logger : `logging.Logger`
+                a logger object to send logging events to
+        """
+        self.input = input
+        self.processPage = processPage
+        self.output = output
+        self.callback = callback
+        self.logger = logger
+        Process.__init__(self)
+
+    def run(self):
+        try:
+            while True:
+                fn = self.input.get(block=False)
+                self.logger.info("Processing dump file %s." % fn)
+                dump = Iterator(openDumpFile(fn))
+                for page in dump.readPages():
+                    self.logger.debug("Processing page %s:%s." % (page.getId(), page.getTitle()))
+                    try:
+                        for out in self.processPage(dump, page):
+                            self.output.put(out)
+                    except Exception as e:
+                        self.logger.error(
+                            "Failed to process page %s:%s - %s" % (
+                                page.getId(),
+                                page.getTitle(),
+                                e
+                            )
+                        )
+
+        except Empty:
+            self.logger.info("Nothing left to do. Shutting down thread.")
+        finally:
+            self.callback()
+
+
+def map(dumps, processPage, threads=cpu_count()-1):
+    """
+    Maps a function across all of the pages in a set of dump files and returns
+    an (order not guaranteed) iterator over the output.
+
+    :Parameters:
+        dumps : list
+            a list of paths to dump files to process
+        processPage : function
+            a function to run on every page of a set of dump files
+        threads : int
+            the number of individual processing threads to spool up
+    """
+    input = dumpFiles(dumps)
+    output = Queue(maxsize=10000)
+    running = Value('i', 0)
+
+    def dec(): running.value -= 1
+
+    for i in range(0, min(threads, input.qsize())):
+        running.value += 1
+        Processor(
+            input,
+            processPage,
+            output,
+            dec,
+            logging.getLogger("Process %s" % i)
+        ).start()
+
+    #output while processes are running
+    while running.value > 0:
+        try: yield output.get(timeout=.25)
+        except Empty: pass
+
+    #finish yielding output buffer
+    try:
+        while True: yield output.get(block=False)
+    except Empty:
+        pass
+
+
+EXTENSIONS = {
+    'xml':  "cat",
+    'bz2':  "bzcat",
+    '7z':   "7z e -so 2>/dev/null",
+    'lzma': "lzcat"
+}
+"""
+A map from file extension to the command to run to extract the data to
+standard out.
+"""

+EXT_RE = re.compile(r'\.([^\.]+)$')
+"""
+A regular expression for extracting the final extension of a file.
+"""
+
+
+def dumpFile(path):
+    """
+    Verifies that a file exists at a given path and that the file has a
+    known extension type.
+
+    :Parameters:
+        path : `str`
+            the path to a dump file
+    """
+    path = os.path.expanduser(path)
+    if not os.path.isfile(path):
+        raise FileTypeError("Can't find file %s" % path)
+
+    match = EXT_RE.search(path)
+    if match == None:
+        raise FileTypeError("No extension found for %s." % path)
+    elif match.groups()[0] not in EXTENSIONS:
+        raise FileTypeError("File type %r is not supported." % path)
+    else:
+        return path
+
+def dumpFiles(paths):
+    """
+    Produces a `multiprocessing.Queue` containing a path for each value in
+    `paths` to be used by the `Processor`s.
+
+    :Parameters:
+        paths : iterable
+            the paths to add to the processing queue
+    """
+    q = Queue()
+    for path in paths: q.put(dumpFile(path))
+    return q
+
+def openDumpFile(path):
+    """
+    Turns a path to a dump file into a file-like object of (decompressed)
+    XML data.
+
+    :Parameters:
+        path : `str`
+            the path to the dump file to read
+    """
+    match = EXT_RE.search(path)
+    ext = match.groups()[0]
+    p = subprocess.Popen(
+        "%s %s" % (EXTENSIONS[ext], path),
+        shell=True,
+        stdout=subprocess.PIPE
+    )
+    return p.stdout
+
+
+def encode(v):
+    """
+    Encodes an output value as a string intended to be read by eval().
+    """
+    if type(v) == types.FloatType:
+        return str(int(v))
+    elif v == None:
+        return "\\N"
+    else:
+        return repr(v)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Maps a function across pages of MediaWiki dump files'
+    )
+    parser.add_argument(
+        '-o', '--out',
+        metavar="<path>",
+        type=lambda path: open(path, "w"),
+        help='the path to an output file to write output to (defaults to stdout)',
+        default=sys.stdout
+    )
+    parser.add_argument(
+        '-t', '--threads',
+        metavar="",
+        type=int,
+        help='the number of threads to start (defaults to # of cores - 1)',
+        default=cpu_count()-1
+    )
+    parser.add_argument(
+        'processor',
+        type=__import__,
+        help='the class path to the module containing the process() function to be applied to each page'
+    )
+    parser.add_argument(
+        'dump',
+        type=dumpFile,
+        help='the XML dump file(s) to process',
+        nargs="+"
+    )
+    args = parser.parse_args()
+
+    LOGGING_STREAM = sys.stderr
+    if __debug__: level = logging.DEBUG
+    else:         level = logging.INFO
+    logging.basicConfig(
+        level=level,
+        stream=LOGGING_STREAM,
+        format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+    logging.info("Starting dump processor with %s threads." % min(args.threads, len(args.dump)))
+    for row in map(args.dump, args.processor.process, args.threads):
+        print('\t'.join(encode(v) for v in row))
+
+if __name__ == "__main__":
+    main()
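Since the docstring above advertises module-level use, here is a sketch of that path. The processPage generator mirrors the one in test_map.py below; the dump path is made up for illustration:

    from wmf.dump.map import map

    def processPage(dump, page):
        # Yield one (page_id, revision_count) row per page.
        count = 0
        for rev in page.readRevisions():
            count += 1
        yield (page.getId(), count)

    # Rows arrive in no guaranteed order, as the map() docstring warns.
    for page_id, count in map(["~/dumps/enwiki-pages-meta-history.xml.7z"], processPage, threads=4):
        print("%s\t%s" % (page_id, count))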
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
===================================================================

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,28 @@
+import os, subprocess
+
+def extractFile(fileName):
+    decompressCall = "lzma -c -q -d %s" % fileName
+    process = subprocess.Popen(
+        decompressCall,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        shell=True
+    )
+    return process.stdout
+
+def getSmallXMLFilePath():
+    pwd = os.path.dirname(os.path.realpath(__file__))
+    return os.path.join(pwd, "small.xml.lzma")
+
+
+def getLargeXMLFilePath():
+    pwd = os.path.dirname(os.path.realpath(__file__))
+    return os.path.join(pwd, "large.xml.lzma")
+
+
+def getSmallXMLFilePointer():
+    return extractFile(getSmallXMLFilePath())
+
+def getLargeXMLFilePointer():
+    return extractFile(getLargeXMLFilePath())
\ No newline at end of file

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
===================================================================
(Binary files differ)

Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
===================================================================
(Binary files differ)

Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,4 @@
+import os
+print(__file__)
+print(os.path.realpath(__file__))
+print(os.path.realpath(__file__)[:-1*len(__file__)])

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1 @@
+

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,81 @@
+import sys, logging
+from nose.tools import eq_
+from . import sample
+from ..iterator import Iterator, Namespace
+from ... import util
+
+logging.basicConfig(level=logging.INFO)
+
+def test_small():
+    fp = sample.getSmallXMLFilePointer()
+    wf = Iterator(fp)
+    for key in [
+        -2, -1, 0, 1, 2, 3, 4, 5, 6,
+        7, 8, 9, 10, 11, 12, 13, 14, 15,
+        100, 101, 108, 109
+    ]:
+        assert key in wf.namespaces.values(), "Key %s not found in %s" % (key, wf.namespaces)
+
+    for page in wf.readPages():
+        eq_(
+            page.getTitle(),
+            u'Talk:Pilsbury Block'
+        )
+        for revision in page.readRevisions():
+            eq_(
+                revision.getId(),
+                213377884
+            )
+            eq_(
+                revision.getTimestamp(),
+                util.wp2Timestamp("2008-05-19T01:41:53Z")
+            )
+            eq_(
+                revision.getContributor().getId(),
+                905763
+            )
+            eq_(
+                revision.getContributor().getUsername(),
+                u"Swampyank"
+            )
+            eq_(
+                revision.getMinor(),
+                False
+            )
+            eq_(
+                revision.getComment(),
+                u"[[WP:AES|\u2190]]Created page with '{{WikiProject National Register of Historic Places|class=Stub}} {{WikiProject Maine|class=Stub|importance=Low}} {{reqphoto|in=Maine}}'"
+            )
+            eq_(
+                revision.getText(),
+                u"{{WikiProject National Register of Historic Places|class=Stub}}\n" +
+                u"{{WikiProject Maine|class=Stub|importance=Low}}\n" +
+                u"{{reqphoto|in=Maine}}"
+            )
+
+
+def test_large():
+    fp = sample.getLargeXMLFilePointer()
+    wf = Iterator(fp)
+    pageCounter = 0
+    revisionCounter = 0
+    for page in wf.readPages():
+        pageCounter += 1
+        for revision in page.readRevisions():
+            assert revision.getId() != None
+            assert revision.getTimestamp() != None
+            __ = revision.getContributor()
+            __ = revision.getComment()
+            assert revision.getMinor() != None
+            assert revision.getText() != None
+            #sys.stderr.write(".")
+            revisionCounter += 1
+            if revisionCounter >= 100: break
+
+    eq_(pageCounter, 1)
+    #eq_(revisionCounter, 15180)
+    eq_(revisionCounter, 100)

Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,25 @@
+import sys, logging
+from nose.tools import eq_
+from . import sample
+from ..map import map
+
+
+def test_simple_map():
+    dumps = [sample.getSmallXMLFilePath(), sample.getLargeXMLFilePath()]
+
+    def processPage(dump, page):
+        assert hasattr(dump, "namespaces")
+        assert hasattr(page, "readRevisions")
+
+        count = 0
+        for rev in page.readRevisions():
+            count += 1
+            if count >= 100: break
+
+        yield (page.getId(), count)
+
+    output = dict(map(dumps, processPage))
+
+    eq_(output[17500012], 1)
+    eq_(output[12], 100)

Added: trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,76 @@
+try:
+    import xml.etree.cElementTree as etree
+except ImportError:
+    import xml.etree.ElementTree as etree
+
+def XMLIterator(fp):
+    xmlIterator = etree.iterparse(fp, events=("start", "end"))
+    return ElementIterator(xmlIterator.next()[1], xmlIterator)
+
+class ElementIteratorError(Exception): pass
+
+class ElementIterator:
+
+    def __init__(self, element, xmlIterator):
+        self.element = element
+        self.xmlIterator = xmlIterator
+        self.tagStack = [self.element.tag]
+
+    def __iter__(self):
+        if len(self.tagStack) == 0:
+            raise ElementIteratorError("Element has already been iterated through.")
+
+        for event, element in self.xmlIterator:
+            if event == "start":
+                element = ElementIterator(element, self.xmlIterator)
+                yield element
+                element.clear()
+            else: #event == "end"
+                assert element.tag == self.element.tag, "Expected %r, got %r" % (self.element.tag, element.tag)
+                self.tagStack.pop()
+
+                if len(self.tagStack) == 0:
+                    break
+
+    def get(self, key, alt=None):
+        return self.element.attrib.get(key, alt)
+
+    def complete(self):
+        if len(self.tagStack) != 0:
+            for event, element in self.xmlIterator:
+                if event == "start":
+                    self.tagStack.append(element.tag)
+                    element.clear()
+                else: #event == "end"
+                    assert self.tagStack[-1] == element.tag, "Expected %r at the end of %r" % (element.tag, self.tagStack)
+                    self.tagStack.pop()
+
+                    if len(self.tagStack) == 0:
+                        break
+
+    def clear(self):
+        self.complete()
+        self.element.clear()
+
+    def __del__(self):
+        self.clear()
+
+    def __getattr__(self, attr):
+        if attr == "attrib":
+            return self.element.attrib
+        elif attr == "tag":
+            return self.element.tag
+        elif attr == "tail":
+            return self.element.tail
+        elif attr == "text":
+            self.complete()
+            return self.element.text
+        else:
+            raise AttributeError("%s has no attribute %r" % (self.__class__.__name__, attr))
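xml_iterator.py carries no docstring, so a short note on its contract may help: XMLIterator wraps etree.iterparse and yields lazily-consumed ElementIterator children; reading .text (or calling clear()) consumes the rest of an element's subtree, which is what keeps memory flat on full-history dumps. A minimal sketch — the XML literal is invented for illustration:

    from StringIO import StringIO
    from wmf.dump.xml_iterator import XMLIterator

    root = XMLIterator(StringIO("<a><b>one</b><b>two</b></a>"))
    for child in root:                            # children arrive in document order
        print("%s: %s" % (child.tag, child.text))  # reading .text completes the element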
Added: trunk/tools/wsor/wikimedia/wmf/util.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/util.py	                        (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/util.py	2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,236 @@
+"""
+This module contains utility functions for interacting with Wikipedia.
+"""
+from __future__ import with_statement, absolute_import
+import re, types
+import time, calendar, datetime
+import hashlib
+import urllib
+
+__docformat__ = "restructuredtext en"
+
+LONG_WP_TIME_STRING = '%Y-%m-%dT%H:%M:%SZ'
+"""
+The longhand version of Wikipedia timestamps.
+"""
+
+SHORT_WP_TIME_STRING = '%Y%m%d%H%M%S'
+"""
+The shorthand version of Wikipedia timestamps.
+"""
+
+WPAPI_URL = "http://%s.wikipedia.org/w/api.php"
+"""
+The Wikipedia API URL.  A positional format token is included so that the
+language-specific prefix can be formatted in.  See `wpAPIURL()`.
+"""
+
+
+VLOOSE_RE = re.compile(r'''
+    (^revert\ to.+using)
+    | (^reverted\ edits\ by.+using)
+    | (^reverted\ edits\ by.+to\ last\ version\ by)
+    | (^bot\ -\ rv.+to\ last\ version\ by)
+    | (-assisted\ reversion)
+    | (^(revert(ed)?|rv).+to\ last)
+    | (^undo\ revision.+by)
+    ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+VSTRICT_RE = re.compile(r'''
+    (\brvv)
+    | (\brv[/ ]v)
+    | (vandal(?!proof|bot))
+    | (\b(rv|rev(ert)?|rm)\b.*(blank|spam|nonsense|porn|mass\sdelet|vand))
+    ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+NAMESPACES = {
+    'en': set([
+        'Media',
+        'Special',
+        'Talk',
+        'User talk',
+        'Wikipedia talk',
+        'Image talk',
+        'MediaWiki talk',
+        'Template talk',
+        'Help talk',
+        'Category talk',
+        'Portal talk',
+        'File talk',
+        'User',
+        'Wikipedia',
+        'Image',
+        'MediaWiki',
+        'Template',
+        'Help',
+        'Category',
+        'Portal',
+        'File'
+    ])
+}
+
+NAMESPACE_RE = re.compile(r'^((?:%s)):' % ')|(?:'.join(NAMESPACES['en']),
+                          re.IGNORECASE)
+
+def wpAPIURL(prefix="en"):
+    """
+    Creates the URL for the Wikipedia API based on a language prefix.
+
+    :Parameters:
+        prefix : string
+            the prefix to be formatted into the url
+
+    :Return:
+        the Wikipedia API url for a given language prefix
+    """
+    return WPAPI_URL % prefix
+
+
+def wp2Timestamp(wpTime):
+    """
+    Converts a Wikipedia timestamp to a Unix Epoch-based timestamp (seconds
+    since Jan. 1st 1970 GMT).  This function will handle both long
+    (see `LONG_WP_TIME_STRING`) and short (see `SHORT_WP_TIME_STRING`)
+    time formats.
+
+    :Parameters:
+        wpTime : string
+            Wikipedia timestamp to be converted
+
+    :Return:
+        integer Unix Epoch-based timestamp (seconds since Jan. 1st 1970
+        GMT) version of the provided wpTime.
+    """
+    try:
+        myTime = time.strptime(wpTime, LONG_WP_TIME_STRING)
+    except ValueError:
+        try:
+            myTime = time.strptime(wpTime, SHORT_WP_TIME_STRING)
+        except ValueError:
+            raise ValueError("'%s' is not a valid Wikipedia date format" % wpTime)
+
+    return int(calendar.timegm(myTime))
+
+def timestamp2WP(timestamp):
+    """
+    Converts a Unix Epoch-based timestamp (seconds since Jan. 1st 1970 GMT)
+    to a timestamp acceptable to Wikipedia.
+
+    :Parameters:
+        timestamp : int
+            Unix timestamp to be converted
+
+    :Return:
+        string Wikipedia style timestamp
+    """
+    return datetime.datetime.utcfromtimestamp(timestamp).strftime(SHORT_WP_TIME_STRING)
+
+def digest(content):
+    return hashlib.md5(content.encode("utf-8")).hexdigest()
+
+
+def normalize(name):
+    """
+    Normalizes text from a Wikipedia title/segment by capitalizing the
+    first letter, replacing underscores with spaces, and stripping
+    surrounding whitespace.
+
+    :Parameters:
+        name : string
+            Namespace or title portion of a Wikipedia page name.
+
+    :Return:
+        string Normalized text
+    """
+    return name.capitalize().replace("_", " ").strip()
+
+def normalizeTitle(title, namespaces=NAMESPACES['en']):
+    """
+    Normalizes a Wikipedia page title and splits the title into
+    namespace and title pieces.
+
+    :Parameters:
+        title : string
+            The title of a Wikipedia page.
+        namespaces : set
+            A set of namespaces to look for in the title.
+
+    :Return:
+        The (namespace, title) tuple
+    """
+    if type(title) == types.UnicodeType:
+        title = title.encode('utf-8')
+
+    title = title.strip()
+    parts = title.split(":", 1)
+    if len(parts) == 1:
+        namespace = None
+        title = normalize(parts[0])
+    elif parts[1] == '':
+        namespace = None
+        title = normalize(title)
+    else:
+        nsPart = normalize(parts[0])
+        if nsPart in namespaces:
+            namespace = nsPart
+            title = normalize(parts[1])
+        else:
+            namespace = None
+            title = normalize(title)
+
+    return (namespace, title)
+
+def normalizeURLTitle(title, namespaces=NAMESPACES['en']):
+    """
+    Normalizes a Wikipedia page title obtained from a URL and splits
+    the title into namespace and title pieces.
+
+    :Parameters:
+        title : string
+            The title of a Wikipedia page.
+        namespaces : set
+            A set of namespaces to look for in the title.
+
+    :Return:
+        The (namespace, title) tuple
+    """
+    if type(title) == types.UnicodeType:
+        title = title.encode('utf-8')
+    title = urllib.unquote(title).split('#')[0]
+    ns = NAMESPACE_RE.match(title)
+    if not ns:
+        namespace = ""
+        title = normalize(title)
+    else:
+        nsPart = ns.group(1).capitalize()
+        if nsPart in namespaces:
+            namespace = nsPart
+            title = normalize(title[ns.end():])
+        else:
+            #The matched prefix is not a recognized namespace.
+            namespace = ""
+            title = normalize(title)
+    return (namespace, title)
+
+def isVandalismByComment(editComment, testLoose=True, testStrict=True):
+    """
+    Checks the given edit comment against the VLOOSE and VSTRICT regexes
+    as configured and returns a boolean indicating whether it matches.
+
+    @param editComment: The edit comment to test.
+    @type editComment: str
+
+    @param testLoose: If the edit comment matches VLOOSE_RE, True is returned
+    @type testLoose: bool
+
+    @param testStrict: If the edit comment matches VSTRICT_RE, True is returned
+    @type testStrict: bool
+    """
+    if testLoose and VLOOSE_RE.search(editComment):
+        return True
+    if testStrict and VSTRICT_RE.search(editComment):
+        return True
+
+    return False
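Finally, a short sketch of the utility functions above in action; the inputs are invented, and the expected values in the comments follow directly from the definitions just shown:

    from wmf import util

    # Long and short Wikipedia timestamp formats parse to the same epoch seconds.
    epoch = util.wp2Timestamp("2008-05-19T01:41:53Z")
    assert epoch == util.wp2Timestamp("20080519014153")
    print(util.timestamp2WP(epoch))                    # 20080519014153

    print(util.normalizeTitle("talk:pilsbury_block"))  # ('Talk', 'Pilsbury block')
    print(util.isVandalismByComment("rvv"))            # True (matches VSTRICT_RE)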