https://www.mediawiki.org/wiki/Special:Code/MediaWiki/107076

Revision: 107076
Author:   halfak
Date:     2011-12-22 17:13:32 +0000 (Thu, 22 Dec 2011)
Log Message:
-----------
message_postings.py script working and documented

Modified Paths:
--------------
    trunk/tools/wsor/message_templates/message_postings.py

Modified: trunk/tools/wsor/message_templates/message_postings.py
===================================================================
--- trunk/tools/wsor/message_templates/message_postings.py      2011-12-22 
16:41:23 UTC (rev 107075)
+++ trunk/tools/wsor/message_templates/message_postings.py      2011-12-22 
17:13:32 UTC (rev 107076)
@@ -1,7 +1,12 @@
-import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time
-import urllib, urllib2
+import sys, argparse, os
+import logging, types, re
+import time, datetime
+import MySQLdb, MySQLdb.cursors
+import urllib, urllib2, json, htmlentitydefs
 import wmf
 
+class MissingRevError(Exception):pass
+
 def encode(v):
        if v == None: return "\N"
        
@@ -10,18 +15,46 @@
        
        return str(v).encode("string-escape")
 
-#                     |  year |   month |     day |    hour |  minute |  
second |
-MW_DATE = 
re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]")
+def emit(rev):
+       
+       print(
+               "\t".join(
+                       encode(rev[c]) for c in [
+                               'rev_id',
+                               'rev_timestamp',
+                               'poster_id',
+                               'poster_name',
+                               'recipient_name',
+                               'message_match'
+                       ]
+               )
+       )
 
+
+#  MediaWiki Date format
+#
+#                      |  year |   month |     day |    hour |  minute |  second |
+MW_DATE = 
re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
 def mwDate(string):
        if MW_DATE.match(string) == None:
-               raise ValueError("%s is not a valid date.  Expected 
YYMMDDHHmmSS" % string)
+               raise ValueError("%r is not a valid date.  Expected 
YYMMDDHHmmSS" % string)
        else:
                return string
 
 def main():
        parser = argparse.ArgumentParser(
-               description='Gathers template message postings based on comment 
and diff matching regular expressions.'
+               description="""
+               Gathers experimental message postings from user_talk messages.
+               """,
+               epilog="""
+               python message_postings.py 
+               -h db42 
+               --start=20111222000000 
+               --end=20111223000000 
+               --comment="\(\[\[WP:HG\|HG\]\]\)" 
+               --message="Template:uw-vandalism1"
+               """,
+               conflict_handler="resolve"
        )
        parser.add_argument(
                '-c', '--cnf',
@@ -31,7 +64,7 @@
                default=os.path.expanduser("~/.my.cnf")
        )
        parser.add_argument(
-               '-s', '--host',
+               '-h', '--host',
                type=str, 
                help='the database host to connect to (defaults to localhost)',
                default="localhost"
@@ -45,29 +78,36 @@
        parser.add_argument(
                '-a', '--api_uri',
                type=str, 
-               help='the default Wikimedia API to connect to in order to 
retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
+               help='the mediawiki API to connect to in order to retrieve 
message content (defaults to http://en.wikipedia.org/w/api.php)',
                default="http://en.wikipedia.org/w/api.php";
        )
        parser.add_argument(
-               '--before',
-               type=str, 
-               help='the default Wikimedia API to connect to in order to 
retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
-               default="http://en.wikipedia.org/w/api.php";
+               '--start',
+               type=mwDate,
+               help='the start of the experimental period. (Required)',
+               required=True
        )
        parser.add_argument(
-               'after',
-               type=mwDate,
-               help='regular expression to match against message content'
+               '--end',
+               type=mwDate, 
+               help='the end of the experimental period.  (defaults to NOW())',
+               default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
        )
        parser.add_argument(
-               'comment',
+               '--user_name',
+               type=str, 
+               help='the user_name to further filter postings by (useful for 
tracking bots)'
+       )
+       parser.add_argument(
+               '--comment',
                type=re.compile,
                help='regular expression to match against message posting 
comment'
        )
        parser.add_argument(
-               'message',
+               '--message',
                type=re.compile,
-               help='regular expression to match against message content'
+               help='regular expression to match against message content 
(required)',
+               required=True
        )
        args = parser.parse_args()
        
@@ -78,6 +118,8 @@
                format='%(asctime)s %(levelname)-8s %(message)s',
                datefmt='%b-%d %H:%M:%S'
        )
+       logging.debug("Comment pattern is %r." % args.comment.pattern)
+       logging.debug("Message pattern is %r." % args.message.pattern)
        
        logging.info("Connecting to %s:%s using %s." % (args.host, args.db, 
args.cnf))
        db = Database(
@@ -85,6 +127,27 @@
                db=args.db, 
                read_default_file=args.cnf
        )
+       
+       logging.info("Connecting to API @ %s." % args.api_uri)
+       api = WPAPI(args.api_uri)
+       
+       logging.info("Querying for matching revisions:")
+       count = {"matched": 0, "missed": 0}
+       for rev in db.getPostings(args.start, args.end, args.user_name, 
args.comment):
+               message = api.getAdded(rev['rev_id'])
+               match = args.message.search(message)
+               if match != None:
+                       rev['message_match'] = match.group(0)
+                       
+                       emit(rev)
+                       LOGGING_STREAM.write("|")
+                       count['matched'] += 1
+               else:
+                       LOGGING_STREAM.write("o")
+                       count['missed'] += 1
+               
+       LOGGING_STREAM.write("\n")
+       logging.info("Process completed. %(matched)s messages matched, 
%(missed)s messages missed." % count)
 
 
 
@@ -95,18 +158,36 @@
                self.kwargs = kwargs
                self.conn   = MySQLdb.connect(*args, **kwargs)
        
-       def getPostings(self, afterDate, commentPattern):
+       def getPostings(self, start, end, userName=None, commentRE=None):
+               if (userName, commentRE) == (None, None):
+                       raise TypeError("Must specify at at least one of 
userName or commentRE.")
+               
                cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
+               query = """
+                       SELECT 
+                               r.rev_id,
+                               r.rev_timestamp,
+                               r.rev_comment,
+                               r.rev_user                      AS poster_id,
+                               r.rev_user_text                 AS poster_name,
+                               REPLACE(p.page_title, "_", " ") AS 
recipient_name
+                       FROM revision r
+                       INNER JOIN page p ON r.rev_page = p.page_id
+                       WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
+                       AND page_namespace = 3
+                       """
+               if userName != None:
+                       query += "AND rev_user_text = %(user_name)s\n"
+               if commentRE != None:
+                       query += "AND rev_comment REGEXP %(comment_pattern)s\n"
+               
                cursor.execute(
-                       """
-                       SELECT * FROM
-                       FROM revision
-                       WHERE rev_timestamp > %(afterDate)s
-                       AND rev_comment REGEXP %(commentPattern)s
-                       """,
+                       query,
                        {
-                               'afterDate': afterDate,
-                               'commentPattern': commentPattern
+                               'start': start,
+                               'end': end,
+                               'user_name': userName,
+                               'comment_pattern': commentRE.pattern
                        }
                )
                
@@ -116,33 +197,65 @@
        
 
 class WPAPI:
+       DIFF_ADD_RE = re.compile(r'<td 
class="diff-addedline"><div>(.+)</div></td>')
        
        def __init__(self, uri):
                self.uri = uri
        
-       def getDiff(self, revId):
+       def getDiff(self, revId, retries=10):
+               attempt = 0
+               while attempt < retries:
+                       try:
+                               response = urllib2.urlopen(
+                                       self.uri,
+                                       urllib.urlencode({
+                                               'action': 'query',
+                                               'prop': 'revisions',
+                                               'revids': revId,
+                                               'rvprop': 'ids',
+                                               'rvdiffto': 'prev',
+                                               'format': 'json'
+                                       })
+                               )
+                               result = json.load(response)
+                               return 
result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
+                       except urllib2.HTTPError as e:
+                               time.sleep(attempt*2)
+                               attempt += 1
+                               
+                       
+       
+       def getAdded(self, revId):
+               diff = self.getDiff(revId)
                
-               response = urllib2.urlopen(
-                       self.uri,
-                       data=urllib.urlencode({
-                               'action': "query",
-                               'prop':   "revisions",
-                               'revids': revId,
-                               'rvprop': "diff",
-                               'format': "json"
-                       })
+               return self.unescape(
+                               "\n".join(
+                               match.group(1) 
+                               for match in WPAPI.DIFF_ADD_RE.finditer(diff)
+                       )
                )
                
-               js = json.load(response)
-               
-               
-               
-               try:
-                       if 'badrevids' in js['query']:
-                               raise KeyError(revId)
+       def unescape(self, text):
+               def fixup(m):
+                       text = m.group(0)
+                       if text[:2] == "&#":
+                               # character reference
+                               try:
+                                       if text[:3] == "&#x":
+                                               return unichr(int(text[3:-1], 
16))
+                                       else:
+                                               return unichr(int(text[2:-1]))
+                               except ValueError:
+                                       pass
                        else:
-                               return 
js['query']['pages'].values()[0]['revisions'][0]['diff']['*']
-               except KeyError:
-                       
+                               # named entity
+                               try:
+                                       text = 
unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+                               except KeyError:
+                                       pass
+                       return text # leave as is
+               return re.sub("&#?\w+;", fixup, text)
        
        
+if __name__ == "__main__": 
+       main()


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to