https://www.mediawiki.org/wiki/Special:Code/MediaWiki/107076
Revision: 107076 Author: halfak Date: 2011-12-22 17:13:32 +0000 (Thu, 22 Dec 2011) Log Message: ----------- message_postings.py script working and documented Modified Paths: -------------- trunk/tools/wsor/message_templates/message_postings.py Modified: trunk/tools/wsor/message_templates/message_postings.py =================================================================== --- trunk/tools/wsor/message_templates/message_postings.py 2011-12-22 16:41:23 UTC (rev 107075) +++ trunk/tools/wsor/message_templates/message_postings.py 2011-12-22 17:13:32 UTC (rev 107076) @@ -1,7 +1,12 @@ -import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time -import urllib, urllib2 +import sys, argparse, os +import logging, types, re +import time, datetime +import MySQLdb, MySQLdb.cursors +import urllib, urllib2, json, htmlentitydefs import wmf +class MissingRevError(Exception):pass + def encode(v): if v == None: return "\N" @@ -10,18 +15,46 @@ return str(v).encode("string-escape") -# | year | month | day | hour | minute | second | -MW_DATE = re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]") +def emit(rev): + + print( + "\t".join( + encode(rev[c]) for c in [ + 'rev_id', + 'rev_timestamp', + 'poster_id', + 'poster_name', + 'recipient_name', + 'message_match' + ] + ) + ) + +# MediaWiki Date format +# +# | year | month | day | hour | minute | second | +MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") def mwDate(string): if MW_DATE.match(string) == None: - raise ValueError("%s is not a valid date. Expected YYMMDDHHmmSS" % string) + raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) else: return string def main(): parser = argparse.ArgumentParser( - description='Gathers template message postings based on comment and diff matching regular expressions.' + description=""" + Gathers experimental message postings from user_talk messages. 
+ """, + epilog=""" + python message_postings.py + -h db42 + --start=20111222000000 + --end=20111223000000 + --comment="\(\[\[WP:HG\|HG\]\]\)" + --message="Template:uw-vandalism1" + """, + conflict_handler="resolve" ) parser.add_argument( '-c', '--cnf', @@ -31,7 +64,7 @@ default=os.path.expanduser("~/.my.cnf") ) parser.add_argument( - '-s', '--host', + '-h', '--host', type=str, help='the database host to connect to (defaults to localhost)', default="localhost" @@ -45,29 +78,36 @@ parser.add_argument( '-a', '--api_uri', type=str, - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', default="http://en.wikipedia.org/w/api.php" ) parser.add_argument( - '--before', - type=str, - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', - default="http://en.wikipedia.org/w/api.php" + '--start', + type=mwDate, + help='the start of the experimental period. (Required)', + required=True ) parser.add_argument( - 'after', - type=mwDate, - help='regular expression to match against message content' + '--end', + type=mwDate, + help='the end of the experimental period. 
(defaults to NOW())', + default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") ) parser.add_argument( - 'comment', + '--user_name', + type=str, + help='the user_name to further filter postings by (useful for tracking bots)' + ) + parser.add_argument( + '--comment', type=re.compile, help='regular expression to match against message posting comment' ) parser.add_argument( - 'message', + '--message', type=re.compile, - help='regular expression to match against message content' + help='regular expression to match against message content (required)', + required=True ) args = parser.parse_args() @@ -78,6 +118,8 @@ format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S' ) + logging.debug("Comment pattern is %r." % args.comment.pattern) + logging.debug("Message pattern is %r." % args.message.pattern) logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) db = Database( @@ -85,6 +127,27 @@ db=args.db, read_default_file=args.cnf ) + + logging.info("Connecting to API @ %s." % args.api_uri) + api = WPAPI(args.api_uri) + + logging.info("Querying for matching revisions:") + count = {"matched": 0, "missed": 0} + for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): + message = api.getAdded(rev['rev_id']) + match = args.message.search(message) + if match != None: + rev['message_match'] = match.group(0) + + emit(rev) + LOGGING_STREAM.write("|") + count['matched'] += 1 + else: + LOGGING_STREAM.write("o") + count['missed'] += 1 + + LOGGING_STREAM.write("\n") + logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." 
% count) @@ -95,18 +158,36 @@ self.kwargs = kwargs self.conn = MySQLdb.connect(*args, **kwargs) - def getPostings(self, afterDate, commentPattern): + def getPostings(self, start, end, userName=None, commentRE=None): + if (userName, commentRE) == (None, None): + raise TypeError("Must specify at at least one of userName or commentRE.") + cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) + query = """ + SELECT + r.rev_id, + r.rev_timestamp, + r.rev_comment, + r.rev_user AS poster_id, + r.rev_user_text AS poster_name, + REPLACE(p.page_title, "_", " ") AS recipient_name + FROM revision r + INNER JOIN page p ON r.rev_page = p.page_id + WHERE rev_timestamp BETWEEN %(start)s AND %(end)s + AND page_namespace = 3 + """ + if userName != None: + query += "AND rev_user_text = %(user_name)s\n" + if commentRE != None: + query += "AND rev_comment REGEXP %(comment_pattern)s\n" + cursor.execute( - """ - SELECT * FROM - FROM revision - WHERE rev_timestamp > %(afterDate)s - AND rev_comment REGEXP %(commentPattern)s - """, + query, { - 'afterDate': afterDate, - 'commentPattern': commentPattern + 'start': start, + 'end': end, + 'user_name': userName, + 'comment_pattern': commentRE.pattern } ) @@ -116,33 +197,65 @@ class WPAPI: + DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>') def __init__(self, uri): self.uri = uri - def getDiff(self, revId): + def getDiff(self, revId, retries=10): + attempt = 0 + while attempt < retries: + try: + response = urllib2.urlopen( + self.uri, + urllib.urlencode({ + 'action': 'query', + 'prop': 'revisions', + 'revids': revId, + 'rvprop': 'ids', + 'rvdiffto': 'prev', + 'format': 'json' + }) + ) + result = json.load(response) + return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] + except urllib2.HTTPError as e: + time.sleep(attempt*2) + attempt += 1 + + + + def getAdded(self, revId): + diff = self.getDiff(revId) - response = urllib2.urlopen( - self.uri, - data=urllib.urlencode({ - 'action': "query", - 'prop': 
"revisions", - 'revids': revId, - 'rvprop': "diff", - 'format': "json" - }) + return self.unescape( + "\n".join( + match.group(1) + for match in WPAPI.DIFF_ADD_RE.finditer(diff) + ) ) - js = json.load(response) - - - - try: - if 'badrevids' in js['query']: - raise KeyError(revId) + def unescape(self, text): + def fixup(m): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass else: - return js['query']['pages'].values()[0]['revisions'][0]['diff']['*'] - except KeyError: - + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + return text # leave as is + return re.sub("&#?\w+;", fixup, text) +if __name__ == "__main__": + main() _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs