https://www.mediawiki.org/wiki/Special:Code/MediaWiki/107959

Revision: 107959
Author:   halfak
Date:     2012-01-03 23:12:51 +0000 (Tue, 03 Jan 2012)
Log Message:
-----------
Postings and metrics scripts working.  Metrics for editcounts, warnings, talk,
and blocks are working, but have not been spot-checked.

Modified Paths:
--------------
    trunk/tools/wsor/message_templates/sql/test.sql
    trunk/tools/wsor/message_templates/umetrics/generators/__init__.py
    trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py

Added Paths:
-----------
    trunk/tools/wsor/message_templates/metrics
    trunk/tools/wsor/message_templates/postings
    trunk/tools/wsor/message_templates/sample_postings.tsv
    trunk/tools/wsor/message_templates/umetrics/
    trunk/tools/wsor/message_templates/umetrics/__init__.py
    trunk/tools/wsor/message_templates/umetrics/generators/
    trunk/tools/wsor/message_templates/umetrics/generators/blocks.py
    trunk/tools/wsor/message_templates/umetrics/generators/talk.py
    trunk/tools/wsor/message_templates/umetrics/generators/warnings.py
    trunk/tools/wsor/message_templates/umetrics/metrics.py
    trunk/tools/wsor/message_templates/umetrics/postings.py
    trunk/tools/wsor/message_templates/umetrics/util/
    trunk/tools/wsor/message_templates/umetrics/util/__init__.py
    trunk/tools/wsor/message_templates/umetrics/util/mw_api.py

Removed Paths:
-------------
    trunk/tools/wsor/message_templates/generators/
    trunk/tools/wsor/message_templates/message_postings.py
    trunk/tools/wsor/message_templates/user_metrics.py

Deleted: trunk/tools/wsor/message_templates/message_postings.py
===================================================================
--- trunk/tools/wsor/message_templates/message_postings.py      2012-01-03 23:09:54 UTC (rev 107958)
+++ trunk/tools/wsor/message_templates/message_postings.py      2012-01-03 23:12:51 UTC (rev 107959)
@@ -1,282 +0,0 @@
-'''
-This script connects to a mediawiki database and API to collect User_talk revisions
-that match a set of patterns (and optionally, username).
-
-:Parameters:
-       Access the script's documentation for a parameter listing.
-       
-       % python message_postings.py --help
-
-:Output:
-       This script writes a set of escaped, tab separated columns to standard out.
-       - Recipient name - The name of the user who received the posting
-       - Timestamp - The time at which the posting was made
-       - Revision ID - The identifier of the revision matching the posting
-       - Poster ID - The identifier of the user who made the posting
-       - Poster name - The name of the user who make the posting
-       - Message match - The portion of the message posting that was matched by the regular expression.
-
-:Example:
-       python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1"
-'''
-import sys, argparse, os
-import logging, types, re
-import time, datetime
-import MySQLdb, MySQLdb.cursors
-import urllib, urllib2, json, htmlentitydefs
-import wmf
-
-class MissingRevError(Exception):pass
-
-def encode(v):
-       if v == None: return "\N"
-       
-       if type(v) == types.LongType:     v = int(v)
-       elif type(v) == types.UnicodeType: v = v.encode('utf-8')
-       
-       return str(v).encode("string-escape")
-
-def emit(rev):
-       
-       print(
-               "\t".join(
-                       encode(rev[c]) for c in [
-                               'recipient_name',
-                               'rev_timestamp',
-                               'rev_id',
-                               'poster_id',
-                               'poster_name',
-                               'message_match'
-                       ]
-               )
-       )
-
-
-#  MediaWiki Date format
-#
-#                      |  year |   month |     day |    hour |  minute |  second |
-MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
-def mwDate(string):
-       if MW_DATE.match(string) == None:
-               raise ValueError("%r is not a valid date.  Expected YYMMDDHHmmSS" % string)
-       else:
-               return string
-
-def main():
-       parser = argparse.ArgumentParser(
-               description="""
-               Gathers experimental message postings from user_talk messages.
-               """,
-               epilog="""
-               python message_postings.py 
-               -h db42 
-               --start=20111222000000 
-               --end=20111223000000 
-               --comment="\(\[\[WP:HG\|HG\]\]\)" 
-               --message="Template:uw-vandalism1"
-               """,
-               conflict_handler="resolve"
-       )
-       parser.add_argument(
-               '-c', '--cnf',
-               metavar="<path>",
-               type=str, 
-               help='the path to MySQL config info (defaults to ~/.my.cnf)',
-               default=os.path.expanduser("~/.my.cnf")
-       )
-       parser.add_argument(
-               '-h', '--host',
-               type=str, 
-               help='the database host to connect to (defaults to localhost)',
-               default="localhost"
-       )
-       parser.add_argument(
-               '-d', '--db',
-               type=str, 
-               help='the language db to run the query in (defaults to enwiki)',
-               default="enwiki"
-       )
-       parser.add_argument(
-               '-a', '--api_uri',
-               type=str, 
-               help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
-               default="http://en.wikipedia.org/w/api.php"
-       )
-       parser.add_argument(
-               '--start',
-               type=mwDate,
-               help='the start of the experimental period. (Required)',
-               required=True
-       )
-       parser.add_argument(
-               '--end',
-               type=mwDate, 
-               help='the end of the experimental period.  (defaults to NOW())',
-               default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
-       )
-       parser.add_argument(
-               '--user_name',
-               type=str, 
-               help='the user_name to further filter postings by (useful for tracking bots)'
-       )
-       parser.add_argument(
-               '--comment',
-               type=re.compile,
-               help='regular expression to match against message posting comment'
-       )
-       parser.add_argument(
-               '--message',
-               type=re.compile,
-               help='regular expression to match against message content (required)',
-               required=True
-       )
-       args = parser.parse_args()
-       
-       LOGGING_STREAM = sys.stderr
-       logging.basicConfig(
-               level=logging.DEBUG,
-               stream=LOGGING_STREAM,
-               format='%(asctime)s %(levelname)-8s %(message)s',
-               datefmt='%b-%d %H:%M:%S'
-       )
-       logging.debug("Comment pattern is %r." % args.comment.pattern)
-       logging.debug("Message pattern is %r." % args.message.pattern)
-       
-       logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
-       db = Database(
-               host=args.host, 
-               db=args.db, 
-               read_default_file=args.cnf
-       )
-       
-       logging.info("Connecting to API @ %s." % args.api_uri)
-       api = WPAPI(args.api_uri)
-       
-       logging.info("Querying for matching revisions:")
-       count = {"matched": 0, "missed": 0}
-       for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
-               message = api.getAdded(rev['rev_id'])
-               match = args.message.search(message)
-               if match != None:
-                       rev['message_match'] = match.group(0)
-                       
-                       emit(rev)
-                       LOGGING_STREAM.write("|")
-                       count['matched'] += 1
-               else:
-                       LOGGING_STREAM.write("o")
-                       count['missed'] += 1
-               
-       LOGGING_STREAM.write("\n")
-       logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count)
-
-
-
-class Database:
-       
-       def __init__(self, *args, **kwargs):
-               self.args   = args
-               self.kwargs = kwargs
-               self.conn   = MySQLdb.connect(*args, **kwargs)
-       
-       def getPostings(self, start, end, userName=None, commentRE=None):
-               if (userName, commentRE) == (None, None):
-                       raise TypeError("Must specify at at least one of userName or commentRE.")
-               
-               cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
-               query = """
-                       SELECT 
-                               r.rev_id,
-                               r.rev_timestamp,
-                               r.rev_comment,
-                               r.rev_user                      AS poster_id,
-                               r.rev_user_text                 AS poster_name,
-                               REPLACE(p.page_title, "_", " ") AS recipient_name
-                       FROM revision r
-                       INNER JOIN page p ON r.rev_page = p.page_id
-                       WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
-                       AND page_namespace = 3
-                       """
-               if userName != None:
-                       query += "AND rev_user_text = %(user_name)s\n"
-               if commentRE != None:
-                       query += "AND rev_comment REGEXP %(comment_pattern)s\n"
-               
-               cursor.execute(
-                       query,
-                       {
-                               'start': start,
-                               'end': end,
-                               'user_name': userName,
-                               'comment_pattern': commentRE.pattern
-                       }
-               )
-               
-               for row in cursor:
-                       yield row
-               
-       
-
-class WPAPI:
-       DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
-       
-       def __init__(self, uri):
-               self.uri = uri
-       
-       def getDiff(self, revId, retries=10):
-               attempt = 0
-               while attempt < retries:
-                       try:
-                               response = urllib2.urlopen(
-                                       self.uri,
-                                       urllib.urlencode({
-                                               'action': 'query',
-                                               'prop': 'revisions',
-                                               'revids': revId,
-                                               'rvprop': 'ids',
-                                               'rvdiffto': 'prev',
-                                               'format': 'json'
-                                       })
-                               )
-                               result = json.load(response)
-                               return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
-                       except urllib2.HTTPError as e:
-                               time.sleep(attempt*2)
-                               attempt += 1
-                               
-                       
-       
-       def getAdded(self, revId):
-               diff = self.getDiff(revId)
-               
-               return self.unescape(
-                               "\n".join(
-                               match.group(1) 
-                               for match in WPAPI.DIFF_ADD_RE.finditer(diff)
-                       )
-               )
-               
-       def unescape(self, text):
-               def fixup(m):
-                       text = m.group(0)
-                       if text[:2] == "&#":
-                               # character reference
-                               try:
-                                       if text[:3] == "&#x":
-                                               return unichr(int(text[3:-1], 16))
-                                       else:
-                                               return unichr(int(text[2:-1]))
-                               except ValueError:
-                                       pass
-                       else:
-                               # named entity
-                               try:
-                                       text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
-                               except KeyError:
-                                       pass
-                       return text # leave as is
-               return re.sub("&#?\w+;", fixup, text)
-       
-       
-if __name__ == "__main__": 
-       main()

Added: trunk/tools/wsor/message_templates/metrics
===================================================================
--- trunk/tools/wsor/message_templates/metrics                          (rev 0)
+++ trunk/tools/wsor/message_templates/metrics  2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+from umetrics.metrics import main;main()


Property changes on: trunk/tools/wsor/message_templates/metrics
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tools/wsor/message_templates/postings
===================================================================
--- trunk/tools/wsor/message_templates/postings                         (rev 0)
+++ trunk/tools/wsor/message_templates/postings 2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+from umetrics.postings import main;main()


Property changes on: trunk/tools/wsor/message_templates/postings
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tools/wsor/message_templates/sample_postings.tsv
===================================================================
--- trunk/tools/wsor/message_templates/sample_postings.tsv                              (rev 0)
+++ trunk/tools/wsor/message_templates/sample_postings.tsv      2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,122 @@
+99.92.179.246  20111222000623  467110760       205121  Koavf   
Template:uw-vandalism1
+206.51.176.121 20111222002717  467113472       7695475 Jim1138 
Template:uw-vandalism1
+96.255.37.252  20111222002944  467113720       7695475 Jim1138 
Template:uw-vandalism1
+88.207.182.230 20111222003427  467114243       7695475 Jim1138 
Template:uw-vandalism1
+75.28.52.111   20111222004219  467115072       8371165 AbigailAbernathy        
Template:uw-vandalism1
+69.142.218.3   20111222004409  467115300       7695475 Jim1138 
Template:uw-vandalism1
+65.96.250.23   20111222004727  467115720       7695475 Jim1138 
Template:uw-vandalism1
+69.158.17.157  20111222004835  467115846       7695475 Jim1138 
Template:uw-vandalism1
+99.226.152.51  20111222005322  467116380       8371165 AbigailAbernathy        
Template:uw-vandalism1
+50.90.114.25   20111222005336  467116401       8371165 AbigailAbernathy        
Template:uw-vandalism1
+24.218.185.51  20111222010055  467117261       8371165 AbigailAbernathy        
Template:uw-vandalism1
+99.60.53.154   20111222012001  467119343       8371165 AbigailAbernathy        
Template:uw-vandalism1
+174.57.17.84   20111222012338  467119703       8371165 AbigailAbernathy        
Template:uw-vandalism1
+PMPC   20111222012759  467120150       7695475 Jim1138 Template:uw-vandalism1
+109.77.113.235 20111222012830  467120200       7695475 Jim1138 
Template:uw-vandalism1
+108.130.64.2   20111222012855  467120243       7695475 Jim1138 
Template:uw-vandalism1
+173.74.248.50  20111222013909  467121294       7695475 Jim1138 
Template:uw-vandalism1
+218.24.165.201 20111222014206  467121631       7695475 Jim1138 
Template:uw-vandalism1
+108.48.90.203  20111222014414  467121875       7695475 Jim1138 
Template:uw-vandalism1
+183.177.191.220        20111222015126  467122559       7695475 Jim1138 
Template:uw-vandalism1
+66.168.26.250  20111222023031  467125938       15020596        Mark Arsten     
Template:uw-vandalism1
+68.98.168.116  20111222023202  467126066       7821268 RandomAct       
Template:uw-vandalism1
+72.184.167.111 20111222023634  467126522       15020596        Mark Arsten     
Template:uw-vandalism1
+82.11.48.151   20111222023654  467126555       15020596        Mark Arsten     
Template:uw-vandalism1
+76.69.9.18     20111222023843  467126740       15020596        Mark Arsten     
Template:uw-vandalism1
+76.179.88.134  20111222024014  467126906       7821268 RandomAct       
Template:uw-vandalism1
+223.228.169.111        20111222024511  467127446       15020596        Mark 
Arsten     Template:uw-vandalism1
+70.73.15.114   20111222024531  467127488       15020596        Mark Arsten     
Template:uw-vandalism1
+115.240.209.58 20111222024600  467127532       15020596        Mark Arsten     
Template:uw-vandalism1
+198.151.130.54 20111222024729  467127691       7821268 RandomAct       
Template:uw-vandalism1
+184.37.78.85   20111222024756  467127733       15020596        Mark Arsten     
Template:uw-vandalism1
+112.210.239.1  20111222025209  467128144       15020596        Mark Arsten     
Template:uw-vandalism1
+71.227.110.218 20111222025853  467128908       7821268 RandomAct       
Template:uw-vandalism1
+70.127.78.61   20111222030015  467129038       15020596        Mark Arsten     
Template:uw-vandalism1
+50.9.34.247    20111222030027  467129069       15020596        Mark Arsten     
Template:uw-vandalism1
+68.1.183.212   20111222030559  467129619       15020596        Mark Arsten     
Template:uw-vandalism1
+67.128.239.58  20111222030620  467129655       15020596        Mark Arsten     
Template:uw-vandalism1
+Carenblake     20111222030640  467129689       15020596        Mark Arsten     
Template:uw-vandalism1
+2.101.130.254  20111222030856  467129927       15020596        Mark Arsten     
Template:uw-vandalism1
+68.68.187.69   20111222031254  467130311       7695475 Jim1138 
Template:uw-vandalism1
+75.86.201.236  20111222031313  467130337       15020596        Mark Arsten     
Template:uw-vandalism1
+74.132.43.218  20111222031420  467130443       15020596        Mark Arsten     
Template:uw-vandalism1
+142.68.160.131 20111222031625  467130652       15020596        Mark Arsten     
Template:uw-vandalism1
+Blueturtle2    20111222031752  467130815       7695475 Jim1138 
Template:uw-vandalism1
+69.153.186.30  20111222032159  467131245       15020596        Mark Arsten     
Template:uw-vandalism1
+24.191.10.180  20111222032237  467131326       15020596        Mark Arsten     
Template:uw-vandalism1
+89.124.240.80  20111222032924  467132037       15020596        Mark Arsten     
Template:uw-vandalism1
+76.202.230.190 20111222033021  467132144       7695475 Jim1138 
Template:uw-vandalism1
+98.154.111.227 20111222033157  467132313       15020596        Mark Arsten     
Template:uw-vandalism1
+99.119.25.29   20111222033215  467132346       15020596        Mark Arsten     
Template:uw-vandalism1
+72.67.11.201   20111222033240  467132395       15020596        Mark Arsten     
Template:uw-vandalism1
+108.84.217.178 20111222033339  467132499       15020596        Mark Arsten     
Template:uw-vandalism1
+98.254.245.97  20111222033347  467132515       15020596        Mark Arsten     
Template:uw-vandalism1
+116.68.248.117 20111222033402  467132545       7695475 Jim1138 
Template:uw-vandalism1
+68.48.81.29    20111222033407  467132557       15020596        Mark Arsten     
Template:uw-vandalism1
+69.14.32.169   20111222033431  467132594       15020596        Mark Arsten     
Template:uw-vandalism1
+76.172.11.143  20111222033513  467132652       15020596        Mark Arsten     
Template:uw-vandalism1
+207.255.163.58 20111222033517  467132663       7695475 Jim1138 
Template:uw-vandalism1
+82.37.109.26   20111222033535  467132688       15020596        Mark Arsten     
Template:uw-vandalism1
+24.1.86.54     20111222033637  467132794       15020596        Mark Arsten     
Template:uw-vandalism1
+68.229.166.67  20111222033749  467132911       15020596        Mark Arsten     
Template:uw-vandalism1
+68.197.139.18  20111222034204  467133304       7695475 Jim1138 
Template:uw-vandalism1
+67.241.26.211  20111222034557  467133679       15020596        Mark Arsten     
Template:uw-vandalism1
+71.72.129.150  20111222034630  467133724       7695475 Jim1138 
Template:uw-vandalism1
+99.112.124.88  20111222034726  467133802       7695475 Jim1138 
Template:uw-vandalism1
+4.254.81.152   20111222034915  467133967       7695475 Jim1138 
Template:uw-vandalism1
+71.191.34.185  20111222035000  467134028       15020596        Mark Arsten     
Template:uw-vandalism1
+108.132.160.105        20111222035215  467134272       15020596        Mark 
Arsten     Template:uw-vandalism1
+174.113.229.186        20111222035227  467134295       15020596        Mark 
Arsten     Template:uw-vandalism1
+86.19.242.120  20111222035300  467134348       15020596        Mark Arsten     
Template:uw-vandalism1
+173.176.118.87 20111222035508  467134517       15020596        Mark Arsten     
Template:uw-vandalism1
+114.142.166.229        20111222035601  467134589       15020596        Mark 
Arsten     Template:uw-vandalism1
+76.25.211.198  20111222035646  467134656       7695475 Jim1138 
Template:uw-vandalism1
+68.3.112.168   20111222035745  467134737       7695475 Jim1138 
Template:uw-vandalism1
+67.20.133.136  20111222042426  467136925       7695475 Jim1138 
Template:uw-vandalism1
+68.5.93.197    20111222042635  467137125       7695475 Jim1138 
Template:uw-vandalism1
+50.135.29.111  20111222042958  467137392       7695475 Jim1138 
Template:uw-vandalism1
+99.91.215.6    20111222043424  467137772       7695475 Jim1138 
Template:uw-vandalism1
+Eastderp1      20111222044200  467138536       7695475 Jim1138 
Template:uw-vandalism1
+173.26.101.78  20111222044530  467138890       7695475 Jim1138 
Template:uw-vandalism1
+80.194.26.184  20111222050300  467140546       7695475 Jim1138 
Template:uw-vandalism1
+69.230.195.135 20111222050355  467140632       7695475 Jim1138 
Template:uw-vandalism1
+Violinmagician 20111222050359  467140640       7695475 Jim1138 
Template:uw-vandalism1
+76.166.147.85  20111222050726  467140963       7695475 Jim1138 
Template:uw-vandalism1
+72.229.150.7   20111222050921  467141171       7695475 Jim1138 
Template:uw-vandalism1
+96.250.109.41  20111222051253  467141520       7695475 Jim1138 
Template:uw-vandalism1
+70.249.216.200 20111222051603  467141788       7695475 Jim1138 
Template:uw-vandalism1
+76.114.237.52  20111222051804  467142013       7695475 Jim1138 
Template:uw-vandalism1
+99.231.38.15   20111222052155  467142435       7695475 Jim1138 
Template:uw-vandalism1
+70.232.36.209  20111222052236  467142502       58193   OverlordQ       
Template:uw-vandalism1
+70.171.84.236  20111222055646  467145424       7695475 Jim1138 
Template:uw-vandalism1
+117.229.124.35 20111222060552  467146249       7695475 Jim1138 
Template:uw-vandalism1
+92.8.86.211    20111222061442  467147022       7695475 Jim1138 
Template:uw-vandalism1
+Jonnygharris   20111222063512  467148666       2359527 Tgeairn 
Template:uw-vandalism1
+68.32.26.30    20111222064442  467149456       7695475 Jim1138 
Template:uw-vandalism1
+122.172.47.49  20111222064608  467149587       7695475 Jim1138 
Template:uw-vandalism1
+68.67.110.139  20111222064632  467149618       2359527 Tgeairn 
Template:uw-vandalism1
+58.68.46.210   20111222070300  467150889       2359527 Tgeairn 
Template:uw-vandalism1
+120.56.171.37  20111222070459  467151018       7695475 Jim1138 
Template:uw-vandalism1
+24.185.100.176 20111222070521  467151047       2359527 Tgeairn 
Template:uw-vandalism1
+94.76.32.252   20111222070626  467151122       7695475 Jim1138 
Template:uw-vandalism1
+71.2.35.65     20111222070954  467151442       2359527 Tgeairn 
Template:uw-vandalism1
+203.52.228.196 20111222071602  467152062       7695475 Jim1138 
Template:uw-vandalism1
+124.84.101.186 20111222072924  467153017       2359527 Tgeairn 
Template:uw-vandalism1
+74.120.224.200 20111222073655  467153546       7695475 Jim1138 
Template:uw-vandalism1
+70.230.154.187 20111222074036  467153814       7695475 Jim1138 
Template:uw-vandalism1
+69.226.149.86  20111222074434  467154103       7695475 Jim1138 
Template:uw-vandalism1
+75.131.131.219 20111222075957  467155209       7695475 Jim1138 
Template:uw-vandalism1
+49.245.133.171 20111222080029  467155275       7695475 Jim1138 
Template:uw-vandalism1
+152.118.24.10  20111222080106  467155326       7695475 Jim1138 
Template:uw-vandalism1
+99.224.122.205 20111222081721  467156446       7695475 Jim1138 
Template:uw-vandalism1
+198.240.133.75 20111222082450  467156945       7695475 Jim1138 
Template:uw-vandalism1
+110.77.227.221 20111222083046  467157456       7695475 Jim1138 
Template:uw-vandalism1
+2.188.4.3      20111222083310  467157657       7695475 Jim1138 
Template:uw-vandalism1
+145.103.249.34 20111222084352  467158487       7695475 Jim1138 
Template:uw-vandalism1
+50.64.10.116   20111222084516  467158585       7695475 Jim1138 
Template:uw-vandalism1
+76.67.17.205   20111222085448  467159343       7695475 Jim1138 
Template:uw-vandalism1
+120.144.129.107        20111222092827  467162126       7695475 Jim1138 
Template:uw-vandalism1
+14.139.243.229 20111222093150  467162401       7695475 Jim1138 
Template:uw-vandalism1
+180.149.52.45  20111222093649  467162851       7695475 Jim1138 
Template:uw-vandalism1
+24.191.25.201  20111222094616  467163617       7695475 Jim1138 
Template:uw-vandalism1
+71.227.10.224  20111222095309  467164127       7695475 Jim1138 
Template:uw-vandalism1

Modified: trunk/tools/wsor/message_templates/sql/test.sql
===================================================================
--- trunk/tools/wsor/message_templates/sql/test.sql     2012-01-03 23:09:54 UTC (rev 107958)
+++ trunk/tools/wsor/message_templates/sql/test.sql     2012-01-03 23:12:51 UTC (rev 107959)
@@ -1 +1,47 @@
+(
+       SELECT
+               False as deleted,
+               page_namespace as ns,
+               count(*) as revisions
+       FROM enwiki.revision
+       INNER JOIN enwiki.page ON rev_page = page_id
+       WHERE rev_timestamp <= "20110101000000"
+       AND rev_user_text = "EpochFail"
+       GROUP BY page_namespace
+)
+UNION
+(
+       SELECT
+               True as deleted,
+               ar_namespace as ns,
+               count(*) as revisions
+       FROM enwiki.archive
+       WHERE ar_timestamp <= "20110101000000"
+       AND ar_user_text = "EpochFail"
+       GROUP BY ar_namespace
+)
 
+SELECT 
+       r.rev_id,
+       r.rev_timestamp,
+       r.rev_comment,
+       r.rev_user                      AS poster_id,
+       r.rev_user_text                 AS poster_name,
+       REPLACE(p.page_title, "_", " ") AS recipient_name
+FROM revision r
+INNER JOIN page p ON r.rev_page = p.page_id
+WHERE rev_timestamp BETWEEN "20111222000000" AND "20111223000000"
+AND page_namespace = 3;
+
+
+SELECT
+       IF(log_params LIKE "%indefinite%", "ban", "block") as type,
+       IF(log_timestamp > "20110101000000", "after", "before") as tense,
+       count(*) as count,
+       min(log_timestamp) as first,
+       max(log_timestamp) as last
+FROM logging
+WHERE log_type = "block"
+AND log_action = "block"
+AND log_title = "EpochFail"
+GROUP BY 1, 2;

Added: trunk/tools/wsor/message_templates/umetrics/__init__.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/__init__.py                             (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/__init__.py     2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1 @@
+

Modified: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py
===================================================================
--- trunk/tools/wsor/message_templates/generators/__init__.py   2012-01-02 20:26:32 UTC (rev 107849)
+++ trunk/tools/wsor/message_templates/umetrics/generators/__init__.py  2012-01-03 23:12:51 UTC (rev 107959)
@@ -1,8 +1,14 @@
 from .edit_counts import EditCounts
+from .talk import Talk
+from .blocks import Blocks
+from .warnings import Warnings
 from .metric_generator import MetricGenerator
 
 GENERATORS = {
-       'editcounts': EditCounts
+       'editcounts': EditCounts,
+       'talk': Talk,
+       'blocks': Blocks,
+       'warnings': Warnings
 }
 
 class Metrics(MetricGenerator):

Added: trunk/tools/wsor/message_templates/umetrics/generators/blocks.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/generators/blocks.py                            (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/generators/blocks.py    2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,57 @@
+import itertools
+from .metric_generator import MetricGenerator 
+
+class Blocks(MetricGenerator):
+       
+       def __init__(self, conn, api):
+               self.conn = conn
+       
+       def headers(self):
+               return [
+                       'blocks_before',
+                       'blocks_after',
+                       'first_block_before',
+                       'last_block_before',
+                       'first_block_after',
+                       'last_block_after',
+                       'bans_before',
+                       'bans_after',
+                       'first_ban_before',
+                       'last_ban_before',
+                       'first_ban_after',
+                       'last_ban_after'
+               ]
+       
+       def values(self, username, timestamp):
+               rowValues = {}
+               
+               cursor = self.conn.cursor()
+               cursor.execute("""
+                               SELECT
+                                       IF(log_params LIKE "%%indefinite%%", "ban", "block") as type,
+                                       IF(log_timestamp > %(timestamp)s, "after", "before") as whense,
+                                       count(*) as count,
+                                       min(log_timestamp) as first,
+                                       max(log_timestamp) as last
+                               FROM logging
+                               WHERE log_type = "block"
+                               AND log_action = "block"
+                               AND log_title = %(username)s
+                               GROUP BY 1, 2
+                       """,
+                       {
+                               'timestamp': timestamp,
+                               'username': username.encode('utf-8').replace(" ", "_")
+                       }
+               )
+               for row in cursor:
+                       rowValues['%(type)ss_%(whense)s' % row] = row['count']
+                       rowValues['first_%(type)s_%(whense)s' % row] = row['first']
+                       rowValues['last_%(type)s_%(whense)s' % row] = row['last']
+               
+               rowValues['blocks_before'] = rowValues.get('blocks_before', 0)
+               rowValues['blocks_after']  = rowValues.get('blocks_after', 0)
+               rowValues['bans_before']   = rowValues.get('bans_before', 0)
+               rowValues['bans_after']    = rowValues.get('bans_after', 0)
+                       
+               return [rowValues.get(c) for c in self.headers()]

Modified: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py
===================================================================
--- trunk/tools/wsor/message_templates/generators/edit_counts.py        2012-01-02 20:26:32 UTC (rev 107849)
+++ trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py       2012-01-03 23:12:51 UTC (rev 107959)
@@ -3,16 +3,16 @@
 
 class EditCounts(MetricGenerator):
        
-       def __init__(self, conn, api_uri):
+       def __init__(self, conn, api):
                self.conn = conn
        
        def headers(self):
                return itertools.chain(*[
                        [
-                               'ns_%s_before_revisions_deleted' % ns,
-                               'ns_%s_after_revisions_deleted' % ns,
-                               'ns_%s_before_revisions_not_deleted' % ns,
-                               'ns_%s_after_revisions_not_deleted' % ns
+                               'ns_%s_revisions_deleted_before' % ns,
+                               'ns_%s_revisions_deleted_after' % ns,
+                               'ns_%s_revisions_not_deleted_before' % ns,
+                               'ns_%s_revisions_not_deleted_after' % ns
                        ]
                        for ns in itertools.chain(range(0,16), [100, 101, 108, 
109])
                ])

Added: trunk/tools/wsor/message_templates/umetrics/generators/talk.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/generators/talk.py              
                (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/generators/talk.py      
2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,51 @@
+import itertools
+from .metric_generator import MetricGenerator 
+
+class Talk(MetricGenerator):
+       """
+       Generates counts of revisions made by other users to a user's talk page
+       (page_namespace = 3) before and after a timestamp, along with the first
+       and last revision timestamp of each period.
+       """
+
+       def __init__(self, conn, api):
+               # `api` is not used by this generator.
+               self.conn = conn
+
+       def headers(self):
+               """Returns the column names, in the order values() emits them."""
+               return [
+                       'other_talk_before',
+                       'first_other_talk_before',
+                       'last_other_talk_before',
+                       'other_talk_after',
+                       'first_other_talk_after',
+                       'last_other_talk_after',
+               ]
+
+       def values(self, username, timestamp):
+               """
+               Returns one value per header for `username` around `timestamp`.
+
+               Counts default to 0 and the first/last timestamps to None when no
+               revision falls in a period.  Revisions saved exactly at `timestamp`
+               are excluded from both periods.  Rows are read by column name, so
+               the connection's cursor must return mappings.
+               """
+               rowValues = {}
+
+               cursor = self.conn.cursor()
+               cursor.execute("""
+                               SELECT
+                                       IF(rev_timestamp > %(timestamp)s, "after", "before") as whense,
+                                       COUNT(*) as count,
+                                       MAX(rev_timestamp) as last,
+                                       MIN(rev_timestamp) as first
+                               FROM revision
+                               INNER JOIN page ON rev_page = page_id
+                               WHERE page_namespace = 3
+                               AND rev_timestamp != %(timestamp)s
+                               AND page_title = %(page_title)s
+                               AND rev_user_text != %(username)s
+                               GROUP BY 1
+                       """,
+                       {
+                               'timestamp': timestamp,
+                               'page_title': username.encode('utf-8').replace(" ", "_"),
+                               'username': username.encode('utf-8')
+                       }
+               )
+               for row in cursor:
+                       # The period column is aliased `whense` in the query; the key
+                       # templates must be filled from the row or they never match
+                       # the names returned by headers().
+                       rowValues['other_talk_%(whense)s' % row] = row['count']
+                       rowValues['first_other_talk_%(whense)s' % row] = row['first']
+                       rowValues['last_other_talk_%(whense)s' % row] = row['last']
+
+               # A period without any revisions produces no row; report it as 0.
+               rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0)
+               rowValues['other_talk_after']  = rowValues.get('other_talk_after', 0)
+
+               return [rowValues.get(c) for c in self.headers()]

Added: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/generators/warnings.py          
                (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/generators/warnings.py  
2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1,97 @@
+import itertools, wmf, difflib, re
+from .metric_generator import MetricGenerator
+
+
+class Warnings(MetricGenerator):
+       """
+       Generates counts of warnings added to a user's talk page before and after
+       a timestamp, along with the first and last warning timestamp of each
+       period.  A warning is a revision whose added text matches WARN_RE.
+       """
+
+       WARN_RE = re.compile(r'<!--\s*Template:uw-')
+
+       def __init__(self, conn, api):
+               self.conn = conn
+               self.api  = api
+
+       def headers(self):
+               """Returns the column names, in the order values() emits them."""
+               return [
+                       'warns_before',
+                       'warns_after',
+                       'first_warn_before',
+                       'last_warn_before',
+                       'first_warn_after',
+                       'last_warn_after'
+               ]
+
+       def values(self, username, timestamp):
+               """
+               Returns one value per header for `username` around `timestamp`.
+
+               Counts default to 0 and first/last timestamps to None.  Revisions
+               saved exactly at `timestamp` are not counted in either period.
+               """
+               rowValues = {
+                       'warns_before':      0,
+                       'warns_after':       0
+               }
+
+               timestamp = wmf.wp2Timestamp(timestamp)
+
+               for rev in self.getProcessedRevs(username):
+                       # Only revisions that added a warning template are of interest.
+                       if self.WARN_RE.search(rev['added']) is None:
+                               continue
+
+                       if rev['timestamp'] < timestamp:
+                               whence = "before"
+                       elif rev['timestamp'] > timestamp:
+                               whence = "after"
+                       else:
+                               continue
+
+                       rowValues['warns_%s' % whence] += 1
+
+                       # Revisions are requested oldest first (rvdir="newer"), so the
+                       # first match seen in a period is its earliest warning.
+                       stamp = wmf.timestamp2WP(rev['timestamp'])
+                       rowValues.setdefault('first_warn_%s' % whence, stamp)
+                       rowValues['last_warn_%s' % whence] = stamp
+
+               return [rowValues.get(c) for c in self.headers()]
+
+       def getProcessedRevs(self, username):
+               """Yields the user's talk page revisions annotated by processRevs()."""
+               return self.processRevs(self.getUserPageRevisions(username))
+
+       def getUserPageRevisions(self, username, rvcontinue=None):
+               """
+               Yields every revision of User_talk:<username> from the API, oldest
+               first, following continuation until the history is exhausted.
+
+               Yields nothing when the page has no revisions.  `rvcontinue` is only
+               sent to the API when a value is given.
+               """
+               # urllib.urlencode() applies str() to each value, which fails on
+               # non-ASCII unicode; hand it UTF-8 bytes instead.
+               if isinstance(username, unicode):
+                       username = username.encode('utf-8')
+
+               params = {
+                       'action':  "query",
+                       'prop':    "revisions",
+                       'titles':  "User_talk:%s" % username,
+                       'rvprop':  "ids|timestamp|content",
+                       'rvdir':   "newer",
+                       'rvlimit': 50
+               }
+               if rvcontinue is not None:
+                       params['rvcontinue'] = rvcontinue
+
+               while True:
+                       js = self.api.request(**params)
+
+                       for page in js['query']['pages'].values():
+                               # A page without history carries no 'revisions' entry.
+                               for rev in page.get('revisions', []):
+                                       yield rev
+
+                       if 'query-continue' not in js:
+                               break
+
+                       # Send back whatever continuation parameters the API returned
+                       # (e.g. rvstartid) under their own names.
+                       params.update(js['query-continue']['revisions'])
+
+       def processRevs(self, revs):
+               """
+               Yields each revision with its content ('*') removed and replaced by
+               'added' (the lines inserted or replaced relative to the previous
+               revision, joined by newlines) and with 'timestamp' converted by
+               wmf.wp2Timestamp().
+               """
+               previousLines = []
+               for rev in revs:
+                       # Content may be absent; treat it as empty text.
+                       lines = rev.pop('*', "").split("\n")
+
+                       added = []
+                       sm = difflib.SequenceMatcher(None, previousLines, lines)
+                       for tag, i1, i2, j1, j2 in sm.get_opcodes():
+                               if tag in ("insert", "replace"):
+                                       added.extend(lines[j1:j2])
+
+                       rev['added'] = "\n".join(added)
+                       rev['timestamp'] = wmf.wp2Timestamp(rev['timestamp'])
+                       yield rev
+                       previousLines = lines
+                       
+               
+                       
+               

Copied: trunk/tools/wsor/message_templates/umetrics/metrics.py (from rev 
107849, trunk/tools/wsor/message_templates/user_metrics.py)
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/metrics.py                      
        (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/metrics.py      2012-01-03 
23:12:51 UTC (rev 107959)
@@ -0,0 +1,100 @@
+import sys, argparse, os
+import logging, types
+import MySQLdb, MySQLdb.cursors
+
+from .generators import GENERATORS, Metrics
+from .util import MWAPI, MWAPIError
+
+def encode(v):
+       """
+       Encodes a value for one column of escaped, tab separated output.
+
+       None becomes the two characters backslash-N (presumably the MySQL-style
+       NULL marker -- confirm against the consumer of this output).  Long
+       integers are converted with int(), unicode is encoded as UTF-8, and the
+       string form of the value is escaped with the "string-escape" codec so
+       that tabs and newlines cannot break the row.
+       """
+       if v is None: return "\\N"
+
+       # isinstance() also covers subclasses, which type() comparison missed.
+       if isinstance(v, types.LongType):      v = int(v)
+       elif isinstance(v, types.UnicodeType): v = v.encode('utf-8')
+
+       return str(v).encode("string-escape")
+
+
+def main():
+       """
+       Reads tab separated (username, timestamp) rows from standard in and
+       writes one tab separated row of metrics per user to standard out,
+       preceded by a header row.  Progress and log messages go to standard
+       error.
+       """
+
+       parser = argparse.ArgumentParser(
+               description="""
+               Gathers metrics for users around a timestamp.
+               """,
+               conflict_handler="resolve"
+       )
+       parser.add_argument(
+               '-c', '--cnf',
+               metavar="<path>",
+               type=str,
+               help='the path to MySQL config info (defaults to ~/.my.cnf)',
+               default=os.path.expanduser("~/.my.cnf")
+       )
+       parser.add_argument(
+               '-h', '--host',
+               type=str,
+               help='the database host to connect to (defaults to localhost)',
+               default="localhost"
+       )
+       parser.add_argument(
+               '-d', '--db',
+               type=str,
+               help='the language db to run the query in (defaults to enwiki)',
+               default="enwiki"
+       )
+       parser.add_argument(
+               '-a', '--api',
+               type=MWAPI,
+               help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
+               default="http://en.wikipedia.org/w/api.php"
+       )
+       parser.add_argument(
+               'generator',
+               type=lambda g: GENERATORS[g],
+               nargs="+",
+               help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys())
+       )
+       args = parser.parse_args()
+
+       LOGGING_STREAM = sys.stderr
+       logging.basicConfig(
+               level=logging.DEBUG,
+               stream=LOGGING_STREAM,
+               format='%(asctime)s %(levelname)-8s %(message)s',
+               datefmt='%b-%d %H:%M:%S'
+       )
+
+       if sys.stdin.isatty():
+               logging.error("No data piped to standard in!")
+               return
+
+
+       logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
+       conn = MySQLdb.connect(
+               host=args.host,
+               db=args.db,
+               read_default_file=args.cnf,
+               cursorclass=MySQLdb.cursors.DictCursor
+       )
+
+       logging.info("Loading generators...")
+       metrics = Metrics(g(conn, args.api) for g in args.generator)
+       print("\t".join(encode(h) for h in metrics.headers()))
+
+
+       logging.info("Processing users...")
+       for line in sys.stdin:
+               fields = line.strip().split("\t")
+               if len(fields) < 2:
+                       # A blank or short line cannot be unpacked into a username
+                       # and timestamp; report it and carry on with the rest.
+                       logging.warning("Skipping malformed input line: %r" % line)
+                       continue
+
+               username, timestamp = fields[0:2]
+               username = unicode(username, 'utf-8')
+
+               logging.debug("\t%s at %s:" % (username, timestamp))
+               print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
+               LOGGING_STREAM.write("o")
+
+       LOGGING_STREAM.write("\n")
+       
+
+
+
+       
+if __name__ == "__main__": 
+       main()

Copied: trunk/tools/wsor/message_templates/umetrics/postings.py (from rev 
107849, trunk/tools/wsor/message_templates/message_postings.py)
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/postings.py                     
        (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/postings.py     2012-01-03 
23:12:51 UTC (rev 107959)
@@ -0,0 +1,282 @@
+'''
+This script connects to a mediawiki database and API to collect User_talk 
revisions
+that match a set of patterns (and optionally, username).
+
+:Parameters:
+       Access the script's documentation for a parameter listing.
+       
+       % python message_postings.py --help
+
+:Output:
+       This script writes a set of escaped, tab separated columns to standard 
out.
+       - Recipient name - The name of the user who received the posting
+       - Timestamp - The time at which the posting was made
+       - Revision ID - The identifier of the revision matching the posting
+       - Poster ID - The identifier of the user who made the posting
+       - Poster name - The name of the user who make the posting
+       - Message match - The portion of the message posting that was matched 
by the regular expression.
+
+:Example:
+       python message_postings.py -h db42 --start=20111222000000 
--end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" 
--message="Template:uw-vandalism1"
+'''
+import sys, argparse, os
+import logging, types, re
+import time, datetime
+import MySQLdb, MySQLdb.cursors
+import urllib, urllib2, json, htmlentitydefs
+import wmf
+
+# Exception type, presumably meant for revisions that cannot be found; nothing
+# in this file raises it.
+class MissingRevError(Exception):pass
+
+def encode(v):
+       """
+       Encodes a value for one column of escaped, tab separated output.
+
+       None becomes the two characters backslash-N (presumably the MySQL-style
+       NULL marker -- confirm against the consumer of this output).  Long
+       integers are converted with int(), unicode is encoded as UTF-8, and the
+       string form of the value is escaped with the "string-escape" codec so
+       that tabs and newlines cannot break the row.
+       """
+       if v is None: return "\\N"
+
+       # isinstance() also covers subclasses, which type() comparison missed.
+       if isinstance(v, types.LongType):      v = int(v)
+       elif isinstance(v, types.UnicodeType): v = v.encode('utf-8')
+
+       return str(v).encode("string-escape")
+
+def emit(rev):
+       """
+       Prints one posting to standard out as a single line of encoded, tab
+       separated columns, in the order listed below.
+       """
+       columns = (
+               'recipient_name',
+               'rev_timestamp',
+               'rev_id',
+               'poster_id',
+               'poster_name',
+               'message_match'
+       )
+       cells = [encode(rev[name]) for name in columns]
+       print("\t".join(cells))
+
+
+#  MediaWiki Date format
+#
+#                      |  year |   month |     day |    hour |  minute |  second |
+MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
+def mwDate(string):
+       """
+       Validates that `string` has the 14 digit form matched by MW_DATE and
+       returns it unchanged, so it can be used as an argparse `type`.
+
+       Raises ValueError when the string does not match.
+       """
+       if MW_DATE.match(string) is None:
+               # The pattern requires a four digit year, so say so in the message.
+               raise ValueError("%r is not a valid date.  Expected YYYYMMDDHHmmSS" % string)
+
+       return string
+
+def main():
+       """
+       Queries the database for talk page revisions in the requested period,
+       fetches the text each one added through the API and prints the
+       revisions whose added text matches --message.  Progress ("|" for a
+       match, "o" for a miss) and log messages go to standard error.
+       """
+       parser = argparse.ArgumentParser(
+               description="""
+               Gathers experimental message postings from user_talk messages.
+               """,
+               epilog="""
+               python message_postings.py 
+               -h db42 
+               --start=20111222000000 
+               --end=20111223000000 
+               --comment="\(\[\[WP:HG\|HG\]\]\)" 
+               --message="Template:uw-vandalism1"
+               """,
+               conflict_handler="resolve"
+       )
+       parser.add_argument(
+               '-c', '--cnf',
+               metavar="<path>",
+               type=str,
+               help='the path to MySQL config info (defaults to ~/.my.cnf)',
+               default=os.path.expanduser("~/.my.cnf")
+       )
+       parser.add_argument(
+               '-h', '--host',
+               type=str,
+               help='the database host to connect to (defaults to localhost)',
+               default="localhost"
+       )
+       parser.add_argument(
+               '-d', '--db',
+               type=str,
+               help='the language db to run the query in (defaults to enwiki)',
+               default="enwiki"
+       )
+       parser.add_argument(
+               '-a', '--api_uri',
+               type=str,
+               help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
+               default="http://en.wikipedia.org/w/api.php"
+       )
+       parser.add_argument(
+               '--start',
+               type=mwDate,
+               help='the start of the experimental period. (Required)',
+               required=True
+       )
+       parser.add_argument(
+               '--end',
+               type=mwDate,
+               help='the end of the experimental period.  (defaults to NOW())',
+               default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
+       )
+       parser.add_argument(
+               '--user_name',
+               type=str,
+               help='the user_name to further filter postings by (useful for tracking bots)'
+       )
+       parser.add_argument(
+               '--comment',
+               type=re.compile,
+               help='regular expression to match against message posting comment'
+       )
+       parser.add_argument(
+               '--message',
+               type=re.compile,
+               help='regular expression to match against message content (required)',
+               required=True
+       )
+       args = parser.parse_args()
+
+       # The query needs at least one of these filters; report that as a usage
+       # error here rather than as a traceback once the query is started.
+       if args.user_name is None and args.comment is None:
+               parser.error("at least one of --user_name or --comment is required")
+
+       LOGGING_STREAM = sys.stderr
+       logging.basicConfig(
+               level=logging.DEBUG,
+               stream=LOGGING_STREAM,
+               format='%(asctime)s %(levelname)-8s %(message)s',
+               datefmt='%b-%d %H:%M:%S'
+       )
+       # --comment is optional, so there may be no pattern to report.
+       if args.comment is not None:
+               logging.debug("Comment pattern is %r." % args.comment.pattern)
+       logging.debug("Message pattern is %r." % args.message.pattern)
+
+       logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
+       db = Database(
+               host=args.host,
+               db=args.db,
+               read_default_file=args.cnf
+       )
+
+       logging.info("Connecting to API @ %s." % args.api_uri)
+       api = WPAPI(args.api_uri)
+
+       logging.info("Querying for matching revisions:")
+       count = {"matched": 0, "missed": 0}
+       for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
+               message = api.getAdded(rev['rev_id'])
+               match = args.message.search(message)
+               if match is not None:
+                       rev['message_match'] = match.group(0)
+
+                       emit(rev)
+                       LOGGING_STREAM.write("|")
+                       count['matched'] += 1
+               else:
+                       LOGGING_STREAM.write("o")
+                       count['missed'] += 1
+
+       LOGGING_STREAM.write("\n")
+       logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count)
+
+
+
+class Database:
+       """Wraps a MySQLdb connection and the posting query run against it."""
+
+       def __init__(self, *args, **kwargs):
+               # All arguments are passed straight through to MySQLdb.connect().
+               self.args   = args
+               self.kwargs = kwargs
+               self.conn   = MySQLdb.connect(*args, **kwargs)
+
+       def getPostings(self, start, end, userName=None, commentRE=None):
+               """
+               Yields one dict per revision of a page in namespace 3 saved between
+               `start` and `end`, filtered by poster name and/or by a regular
+               expression applied to the revision comment.
+
+               `commentRE` is a compiled pattern; its `.pattern` text is handed to
+               MySQL's REGEXP operator.  Raises TypeError (on first iteration,
+               since this is a generator) when neither filter is given.
+               """
+               if (userName, commentRE) == (None, None):
+                       raise TypeError("Must specify at at least one of userName or commentRE.")
+
+               cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
+               query = """
+                       SELECT 
+                               r.rev_id,
+                               r.rev_timestamp,
+                               r.rev_comment,
+                               r.rev_user                      AS poster_id,
+                               r.rev_user_text                 AS poster_name,
+                               REPLACE(p.page_title, "_", " ") AS recipient_name
+                       FROM revision r
+                       INNER JOIN page p ON r.rev_page = p.page_id
+                       WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
+                       AND page_namespace = 3
+                       """
+               params = {
+                       'start': start,
+                       'end': end
+               }
+               # Each optional filter adds its clause and its parameter together,
+               # so an omitted commentRE is never dereferenced.
+               if userName is not None:
+                       query += "AND rev_user_text = %(user_name)s\n"
+                       params['user_name'] = userName
+               if commentRE is not None:
+                       query += "AND rev_comment REGEXP %(comment_pattern)s\n"
+                       params['comment_pattern'] = commentRE.pattern
+
+               cursor.execute(query, params)
+
+               for row in cursor:
+                       yield row
+               
+       
+
+class WPAPI:
+       """Retrieves the text added by a revision from a MediaWiki API."""
+
+       # Captures the content of each added-line cell in the diff HTML.
+       DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
+
+       def __init__(self, uri):
+               self.uri = uri
+
+       def getDiff(self, revId, retries=10):
+               """
+               Returns the diff HTML between revision `revId` and its previous
+               revision.
+
+               The request is attempted up to `retries` times, sleeping longer
+               after each HTTP error.  When every attempt fails the last HTTPError
+               is raised, so the failure is reported where it happened instead of
+               a None being handed to the caller.
+               """
+               lastError = None
+               for attempt in range(retries):
+                       try:
+                               response = urllib2.urlopen(
+                                       self.uri,
+                                       urllib.urlencode({
+                                               'action': 'query',
+                                               'prop': 'revisions',
+                                               'revids': revId,
+                                               'rvprop': 'ids',
+                                               'rvdiffto': 'prev',
+                                               'format': 'json'
+                                       })
+                               )
+                               result = json.load(response)
+                               return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
+                       except urllib2.HTTPError as e:
+                               lastError = e
+                               # No point in waiting once there is no attempt left.
+                               if attempt + 1 < retries:
+                                       time.sleep(attempt*2)
+
+               if lastError is not None:
+                       raise lastError
+
+       def getAdded(self, revId):
+               """
+               Returns the lines added by revision `revId`, joined by newlines,
+               with HTML entities replaced by their characters.
+               """
+               diff = self.getDiff(revId)
+
+               return self.unescape(
+                       "\n".join(
+                               match.group(1)
+                               for match in WPAPI.DIFF_ADD_RE.finditer(diff)
+                       )
+               )
+
+       def unescape(self, text):
+               """
+               Replaces numeric character references and named HTML entities in
+               `text` with their characters.  Anything that cannot be resolved is
+               left as it is.
+               """
+               def fixup(m):
+                       text = m.group(0)
+                       if text[:2] == "&#":
+                               # character reference
+                               try:
+                                       if text[:3] == "&#x":
+                                               return unichr(int(text[3:-1], 16))
+                                       else:
+                                               return unichr(int(text[2:-1]))
+                               except ValueError:
+                                       pass
+                       else:
+                               # named entity
+                               try:
+                                       text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+                               except KeyError:
+                                       pass
+                       return text # leave as is
+               return re.sub(r"&#?\w+;", fixup, text)
+       
+       
+if __name__ == "__main__": 
+       main()

Added: trunk/tools/wsor/message_templates/umetrics/util/__init__.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/util/__init__.py                
                (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/util/__init__.py        
2012-01-03 23:12:51 UTC (rev 107959)
@@ -0,0 +1 @@
+from .mw_api import MWAPI, MWAPIError

Added: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/util/mw_api.py                  
        (rev 0)
+++ trunk/tools/wsor/message_templates/umetrics/util/mw_api.py  2012-01-03 
23:12:51 UTC (rev 107959)
@@ -0,0 +1,52 @@
+import urllib2, urllib, json
+import time
+from cookielib import CookieJar
+
+class MWAPIError(Exception):
+       """
+       An error reported by the MediaWiki API.
+
+       `code` and `info` hold the two values the error was created with.
+       """
+       def __init__(self, code, message):
+               Exception.__init__(self, code, message)
+               self.code = code
+               # The parameter is named `message`; it is stored as `info`, the
+               # name __repr__ and __str__ read.
+               self.info = message
+
+       def __repr__(self):
+               return "%s(%s)" % (
+                       self.__class__.__name__,
+                       # str.join() takes a single iterable, not separate arguments.
+                       ", ".join([
+                               repr(self.code),
+                               repr(self.info)
+                       ])
+               )
+
+       def __str__(self):
+               return "%s: %s" % (self.code, self.info)
+
+class MWAPI:
+       """A minimal MediaWiki API client that keeps cookies between requests."""
+
+       def __init__(self, uri):
+               self.uri = uri
+               self.cookies = CookieJar()
+
+       def request(self, retry=0, **kwargs):
+               """
+               Posts `kwargs` as API parameters (with format=json forced) and
+               returns the decoded JSON response.
+
+               An HTTP error is retried after sleeping 2**retry seconds, where
+               `retry` counts the attempts already made.
+               NOTE: the number of retries is not bounded.
+
+               Raises MWAPIError when the response contains an 'error' entry.
+               """
+               kwargs['format'] = "json"
+
+               request = urllib2.Request(
+                               self.uri,
+                               urllib.urlencode(kwargs)
+               )
+               self.cookies.add_cookie_header(request)
+
+               try:
+                       response = urllib2.urlopen(request)
+               except urllib2.HTTPError:
+                       #wait and try again
+                       time.sleep(2**retry)
+                       # Hand back the retry's result; falling through would use a
+                       # `response` that was never assigned.
+                       return self.request(retry=retry+1, **kwargs)
+
+               self.cookies.extract_cookies(response, request)
+
+               js = json.load(response)
+
+               if 'error' in js:
+                       raise MWAPIError(js['error']['code'], js['error']['info'])
+               else:
+                       return js
+                               

Deleted: trunk/tools/wsor/message_templates/user_metrics.py
===================================================================
--- trunk/tools/wsor/message_templates/user_metrics.py  2012-01-03 23:09:54 UTC 
(rev 107958)
+++ trunk/tools/wsor/message_templates/user_metrics.py  2012-01-03 23:12:51 UTC 
(rev 107959)
@@ -1,102 +0,0 @@
-import sys, argparse, os
-import logging, types
-import MySQLdb, MySQLdb.cursors
-
-from generators import GENERATORS, Metrics, EditCounts
-
-
-class MissingRevError(Exception):pass
-
-def encode(v):
-       if v == None: return "\N"
-       
-       if type(v) == types.LongType:     v = int(v)
-       elif type(v) == types.UnicodeType: v = v.encode('utf-8')
-       
-       return str(v).encode("string-escape")
-
-
-def main():
-       
-       parser = argparse.ArgumentParser(
-               description="""
-               Gathers metrics for users around a timestamp.
-               """,
-               conflict_handler="resolve"
-       )
-       parser.add_argument(
-               '-c', '--cnf',
-               metavar="<path>",
-               type=str, 
-               help='the path to MySQL config info (defaults to ~/.my.cnf)',
-               default=os.path.expanduser("~/.my.cnf")
-       )
-       parser.add_argument(
-               '-h', '--host',
-               type=str, 
-               help='the database host to connect to (defaults to localhost)',
-               default="localhost"
-       )
-       parser.add_argument(
-               '-d', '--db',
-               type=str, 
-               help='the language db to run the query in (defaults to enwiki)',
-               default="enwiki"
-       )
-       parser.add_argument(
-               '-a', '--api_uri',
-               type=str, 
-               help='the mediawiki API to connect to in order to retrieve 
message content (defaults to http://en.wikipedia.org/w/api.php)',
-               default="http://en.wikipedia.org/w/api.php";
-       )
-       parser.add_argument(
-               'generator',
-               type=lambda g: GENERATORS[g],
-               nargs="+",
-               help='the metric generators to run (%s)' % ', 
'.join(GENERATORS.keys())
-       )
-       args = parser.parse_args()
-       
-       LOGGING_STREAM = sys.stderr
-       logging.basicConfig(
-               level=logging.DEBUG,
-               stream=LOGGING_STREAM,
-               format='%(asctime)s %(levelname)-8s %(message)s',
-               datefmt='%b-%d %H:%M:%S'
-       )
-       
-       if sys.stdin.isatty():
-               logging.error("No data piped to standard in!")
-               return
-       
-       
-       logging.info("Connecting to %s:%s using %s." % (args.host, args.db, 
args.cnf))
-       conn = MySQLdb.connect(
-               host=args.host, 
-               db=args.db, 
-               read_default_file=args.cnf,
-               cursorclass=MySQLdb.cursors.DictCursor
-       )
-       
-       logging.info("Loading generators...")
-       metrics = Metrics(g(conn, args.api_uri) for g in args.generator)
-       print("\t".join(encode(h) for h in metrics.headers()))
-       
-       
-       logging.info("Processing users...")
-       for line in sys.stdin:
-               username, timestamp = line.strip().split("\t")[0:2]
-               username = unicode(username, 'utf-8')
-               
-               logging.debug("\t%s at %s:" % (username, timestamp))
-               print("\t".join(encode(v) for v in metrics.values(username, 
timestamp)))
-               LOGGING_STREAM.write("o")
-               
-       LOGGING_STREAM.write("\n")
-       
-
-
-
-       
-if __name__ == "__main__": 
-       main()


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to