https://www.mediawiki.org/wiki/Special:Code/MediaWiki/107959
Revision: 107959 Author: halfak Date: 2012-01-03 23:12:51 +0000 (Tue, 03 Jan 2012) Log Message: ----------- Postings and metrics script working. Metrics for editcounts, warnings, talk and blocks working, but not spot checked. Modified Paths: -------------- trunk/tools/wsor/message_templates/sql/test.sql trunk/tools/wsor/message_templates/umetrics/generators/__init__.py trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py Added Paths: ----------- trunk/tools/wsor/message_templates/metrics trunk/tools/wsor/message_templates/postings trunk/tools/wsor/message_templates/sample_postings.tsv trunk/tools/wsor/message_templates/umetrics/ trunk/tools/wsor/message_templates/umetrics/__init__.py trunk/tools/wsor/message_templates/umetrics/generators/ trunk/tools/wsor/message_templates/umetrics/generators/blocks.py trunk/tools/wsor/message_templates/umetrics/generators/talk.py trunk/tools/wsor/message_templates/umetrics/generators/warnings.py trunk/tools/wsor/message_templates/umetrics/metrics.py trunk/tools/wsor/message_templates/umetrics/postings.py trunk/tools/wsor/message_templates/umetrics/util/ trunk/tools/wsor/message_templates/umetrics/util/__init__.py trunk/tools/wsor/message_templates/umetrics/util/mw_api.py Removed Paths: ------------- trunk/tools/wsor/message_templates/generators/ trunk/tools/wsor/message_templates/message_postings.py trunk/tools/wsor/message_templates/user_metrics.py Deleted: trunk/tools/wsor/message_templates/message_postings.py =================================================================== --- trunk/tools/wsor/message_templates/message_postings.py 2012-01-03 23:09:54 UTC (rev 107958) +++ trunk/tools/wsor/message_templates/message_postings.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -1,282 +0,0 @@ -''' -This script connects to a mediawiki database and API to collect User_talk revisions -that match a set of patterns (and optionally, username). - -:Parameters: - Access the script's documentation for a parameter listing. 
- - % python message_postings.py --help - -:Output: - This script writes a set of escaped, tab separated columns to standard out. - - Recipient name - The name of the user who received the posting - - Timestamp - The time at which the posting was made - - Revision ID - The identifier of the revision matching the posting - - Poster ID - The identifier of the user who made the posting - - Poster name - The name of the user who make the posting - - Message match - The portion of the message posting that was matched by the regular expression. - -:Example: - python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1" -''' -import sys, argparse, os -import logging, types, re -import time, datetime -import MySQLdb, MySQLdb.cursors -import urllib, urllib2, json, htmlentitydefs -import wmf - -class MissingRevError(Exception):pass - -def encode(v): - if v == None: return "\N" - - if type(v) == types.LongType: v = int(v) - elif type(v) == types.UnicodeType: v = v.encode('utf-8') - - return str(v).encode("string-escape") - -def emit(rev): - - print( - "\t".join( - encode(rev[c]) for c in [ - 'recipient_name', - 'rev_timestamp', - 'rev_id', - 'poster_id', - 'poster_name', - 'message_match' - ] - ) - ) - - -# MediaWiki Date format -# -# | year | month | day | hour | minute | second | -MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") -def mwDate(string): - if MW_DATE.match(string) == None: - raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) - else: - return string - -def main(): - parser = argparse.ArgumentParser( - description=""" - Gathers experimental message postings from user_talk messages. 
- """, - epilog=""" - python message_postings.py - -h db42 - --start=20111222000000 - --end=20111223000000 - --comment="\(\[\[WP:HG\|HG\]\]\)" - --message="Template:uw-vandalism1" - """, - conflict_handler="resolve" - ) - parser.add_argument( - '-c', '--cnf', - metavar="<path>", - type=str, - help='the path to MySQL config info (defaults to ~/.my.cnf)', - default=os.path.expanduser("~/.my.cnf") - ) - parser.add_argument( - '-h', '--host', - type=str, - help='the database host to connect to (defaults to localhost)', - default="localhost" - ) - parser.add_argument( - '-d', '--db', - type=str, - help='the language db to run the query in (defaults to enwiki)', - default="enwiki" - ) - parser.add_argument( - '-a', '--api_uri', - type=str, - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', - default="http://en.wikipedia.org/w/api.php" - ) - parser.add_argument( - '--start', - type=mwDate, - help='the start of the experimental period. (Required)', - required=True - ) - parser.add_argument( - '--end', - type=mwDate, - help='the end of the experimental period. (defaults to NOW())', - default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") - ) - parser.add_argument( - '--user_name', - type=str, - help='the user_name to further filter postings by (useful for tracking bots)' - ) - parser.add_argument( - '--comment', - type=re.compile, - help='regular expression to match against message posting comment' - ) - parser.add_argument( - '--message', - type=re.compile, - help='regular expression to match against message content (required)', - required=True - ) - args = parser.parse_args() - - LOGGING_STREAM = sys.stderr - logging.basicConfig( - level=logging.DEBUG, - stream=LOGGING_STREAM, - format='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%b-%d %H:%M:%S' - ) - logging.debug("Comment pattern is %r." % args.comment.pattern) - logging.debug("Message pattern is %r." 
% args.message.pattern) - - logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) - db = Database( - host=args.host, - db=args.db, - read_default_file=args.cnf - ) - - logging.info("Connecting to API @ %s." % args.api_uri) - api = WPAPI(args.api_uri) - - logging.info("Querying for matching revisions:") - count = {"matched": 0, "missed": 0} - for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): - message = api.getAdded(rev['rev_id']) - match = args.message.search(message) - if match != None: - rev['message_match'] = match.group(0) - - emit(rev) - LOGGING_STREAM.write("|") - count['matched'] += 1 - else: - LOGGING_STREAM.write("o") - count['missed'] += 1 - - LOGGING_STREAM.write("\n") - logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count) - - - -class Database: - - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - self.conn = MySQLdb.connect(*args, **kwargs) - - def getPostings(self, start, end, userName=None, commentRE=None): - if (userName, commentRE) == (None, None): - raise TypeError("Must specify at at least one of userName or commentRE.") - - cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) - query = """ - SELECT - r.rev_id, - r.rev_timestamp, - r.rev_comment, - r.rev_user AS poster_id, - r.rev_user_text AS poster_name, - REPLACE(p.page_title, "_", " ") AS recipient_name - FROM revision r - INNER JOIN page p ON r.rev_page = p.page_id - WHERE rev_timestamp BETWEEN %(start)s AND %(end)s - AND page_namespace = 3 - """ - if userName != None: - query += "AND rev_user_text = %(user_name)s\n" - if commentRE != None: - query += "AND rev_comment REGEXP %(comment_pattern)s\n" - - cursor.execute( - query, - { - 'start': start, - 'end': end, - 'user_name': userName, - 'comment_pattern': commentRE.pattern - } - ) - - for row in cursor: - yield row - - - -class WPAPI: - DIFF_ADD_RE = re.compile(r'<td 
class="diff-addedline"><div>(.+)</div></td>') - - def __init__(self, uri): - self.uri = uri - - def getDiff(self, revId, retries=10): - attempt = 0 - while attempt < retries: - try: - response = urllib2.urlopen( - self.uri, - urllib.urlencode({ - 'action': 'query', - 'prop': 'revisions', - 'revids': revId, - 'rvprop': 'ids', - 'rvdiffto': 'prev', - 'format': 'json' - }) - ) - result = json.load(response) - return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] - except urllib2.HTTPError as e: - time.sleep(attempt*2) - attempt += 1 - - - - def getAdded(self, revId): - diff = self.getDiff(revId) - - return self.unescape( - "\n".join( - match.group(1) - for match in WPAPI.DIFF_ADD_RE.finditer(diff) - ) - ) - - def unescape(self, text): - def fixup(m): - text = m.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - return unichr(int(text[3:-1], 16)) - else: - return unichr(int(text[2:-1])) - except ValueError: - pass - else: - # named entity - try: - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) - except KeyError: - pass - return text # leave as is - return re.sub("&#?\w+;", fixup, text) - - -if __name__ == "__main__": - main() Added: trunk/tools/wsor/message_templates/metrics =================================================================== --- trunk/tools/wsor/message_templates/metrics (rev 0) +++ trunk/tools/wsor/message_templates/metrics 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,2 @@ +#!/usr/bin/env python +from umetrics.metrics import main;main() Property changes on: trunk/tools/wsor/message_templates/metrics ___________________________________________________________________ Added: svn:executable + * Added: trunk/tools/wsor/message_templates/postings =================================================================== --- trunk/tools/wsor/message_templates/postings (rev 0) +++ trunk/tools/wsor/message_templates/postings 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,2 @@ +#!/usr/bin/env 
python +from umetrics.postings import main;main() Property changes on: trunk/tools/wsor/message_templates/postings ___________________________________________________________________ Added: svn:executable + * Added: trunk/tools/wsor/message_templates/sample_postings.tsv =================================================================== --- trunk/tools/wsor/message_templates/sample_postings.tsv (rev 0) +++ trunk/tools/wsor/message_templates/sample_postings.tsv 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,122 @@ +99.92.179.246 20111222000623 467110760 205121 Koavf Template:uw-vandalism1 +206.51.176.121 20111222002717 467113472 7695475 Jim1138 Template:uw-vandalism1 +96.255.37.252 20111222002944 467113720 7695475 Jim1138 Template:uw-vandalism1 +88.207.182.230 20111222003427 467114243 7695475 Jim1138 Template:uw-vandalism1 +75.28.52.111 20111222004219 467115072 8371165 AbigailAbernathy Template:uw-vandalism1 +69.142.218.3 20111222004409 467115300 7695475 Jim1138 Template:uw-vandalism1 +65.96.250.23 20111222004727 467115720 7695475 Jim1138 Template:uw-vandalism1 +69.158.17.157 20111222004835 467115846 7695475 Jim1138 Template:uw-vandalism1 +99.226.152.51 20111222005322 467116380 8371165 AbigailAbernathy Template:uw-vandalism1 +50.90.114.25 20111222005336 467116401 8371165 AbigailAbernathy Template:uw-vandalism1 +24.218.185.51 20111222010055 467117261 8371165 AbigailAbernathy Template:uw-vandalism1 +99.60.53.154 20111222012001 467119343 8371165 AbigailAbernathy Template:uw-vandalism1 +174.57.17.84 20111222012338 467119703 8371165 AbigailAbernathy Template:uw-vandalism1 +PMPC 20111222012759 467120150 7695475 Jim1138 Template:uw-vandalism1 +109.77.113.235 20111222012830 467120200 7695475 Jim1138 Template:uw-vandalism1 +108.130.64.2 20111222012855 467120243 7695475 Jim1138 Template:uw-vandalism1 +173.74.248.50 20111222013909 467121294 7695475 Jim1138 Template:uw-vandalism1 +218.24.165.201 20111222014206 467121631 7695475 Jim1138 Template:uw-vandalism1 +108.48.90.203 
20111222014414 467121875 7695475 Jim1138 Template:uw-vandalism1 +183.177.191.220 20111222015126 467122559 7695475 Jim1138 Template:uw-vandalism1 +66.168.26.250 20111222023031 467125938 15020596 Mark Arsten Template:uw-vandalism1 +68.98.168.116 20111222023202 467126066 7821268 RandomAct Template:uw-vandalism1 +72.184.167.111 20111222023634 467126522 15020596 Mark Arsten Template:uw-vandalism1 +82.11.48.151 20111222023654 467126555 15020596 Mark Arsten Template:uw-vandalism1 +76.69.9.18 20111222023843 467126740 15020596 Mark Arsten Template:uw-vandalism1 +76.179.88.134 20111222024014 467126906 7821268 RandomAct Template:uw-vandalism1 +223.228.169.111 20111222024511 467127446 15020596 Mark Arsten Template:uw-vandalism1 +70.73.15.114 20111222024531 467127488 15020596 Mark Arsten Template:uw-vandalism1 +115.240.209.58 20111222024600 467127532 15020596 Mark Arsten Template:uw-vandalism1 +198.151.130.54 20111222024729 467127691 7821268 RandomAct Template:uw-vandalism1 +184.37.78.85 20111222024756 467127733 15020596 Mark Arsten Template:uw-vandalism1 +112.210.239.1 20111222025209 467128144 15020596 Mark Arsten Template:uw-vandalism1 +71.227.110.218 20111222025853 467128908 7821268 RandomAct Template:uw-vandalism1 +70.127.78.61 20111222030015 467129038 15020596 Mark Arsten Template:uw-vandalism1 +50.9.34.247 20111222030027 467129069 15020596 Mark Arsten Template:uw-vandalism1 +68.1.183.212 20111222030559 467129619 15020596 Mark Arsten Template:uw-vandalism1 +67.128.239.58 20111222030620 467129655 15020596 Mark Arsten Template:uw-vandalism1 +Carenblake 20111222030640 467129689 15020596 Mark Arsten Template:uw-vandalism1 +2.101.130.254 20111222030856 467129927 15020596 Mark Arsten Template:uw-vandalism1 +68.68.187.69 20111222031254 467130311 7695475 Jim1138 Template:uw-vandalism1 +75.86.201.236 20111222031313 467130337 15020596 Mark Arsten Template:uw-vandalism1 +74.132.43.218 20111222031420 467130443 15020596 Mark Arsten Template:uw-vandalism1 +142.68.160.131 20111222031625 
467130652 15020596 Mark Arsten Template:uw-vandalism1 +Blueturtle2 20111222031752 467130815 7695475 Jim1138 Template:uw-vandalism1 +69.153.186.30 20111222032159 467131245 15020596 Mark Arsten Template:uw-vandalism1 +24.191.10.180 20111222032237 467131326 15020596 Mark Arsten Template:uw-vandalism1 +89.124.240.80 20111222032924 467132037 15020596 Mark Arsten Template:uw-vandalism1 +76.202.230.190 20111222033021 467132144 7695475 Jim1138 Template:uw-vandalism1 +98.154.111.227 20111222033157 467132313 15020596 Mark Arsten Template:uw-vandalism1 +99.119.25.29 20111222033215 467132346 15020596 Mark Arsten Template:uw-vandalism1 +72.67.11.201 20111222033240 467132395 15020596 Mark Arsten Template:uw-vandalism1 +108.84.217.178 20111222033339 467132499 15020596 Mark Arsten Template:uw-vandalism1 +98.254.245.97 20111222033347 467132515 15020596 Mark Arsten Template:uw-vandalism1 +116.68.248.117 20111222033402 467132545 7695475 Jim1138 Template:uw-vandalism1 +68.48.81.29 20111222033407 467132557 15020596 Mark Arsten Template:uw-vandalism1 +69.14.32.169 20111222033431 467132594 15020596 Mark Arsten Template:uw-vandalism1 +76.172.11.143 20111222033513 467132652 15020596 Mark Arsten Template:uw-vandalism1 +207.255.163.58 20111222033517 467132663 7695475 Jim1138 Template:uw-vandalism1 +82.37.109.26 20111222033535 467132688 15020596 Mark Arsten Template:uw-vandalism1 +24.1.86.54 20111222033637 467132794 15020596 Mark Arsten Template:uw-vandalism1 +68.229.166.67 20111222033749 467132911 15020596 Mark Arsten Template:uw-vandalism1 +68.197.139.18 20111222034204 467133304 7695475 Jim1138 Template:uw-vandalism1 +67.241.26.211 20111222034557 467133679 15020596 Mark Arsten Template:uw-vandalism1 +71.72.129.150 20111222034630 467133724 7695475 Jim1138 Template:uw-vandalism1 +99.112.124.88 20111222034726 467133802 7695475 Jim1138 Template:uw-vandalism1 +4.254.81.152 20111222034915 467133967 7695475 Jim1138 Template:uw-vandalism1 +71.191.34.185 20111222035000 467134028 15020596 Mark Arsten 
Template:uw-vandalism1 +108.132.160.105 20111222035215 467134272 15020596 Mark Arsten Template:uw-vandalism1 +174.113.229.186 20111222035227 467134295 15020596 Mark Arsten Template:uw-vandalism1 +86.19.242.120 20111222035300 467134348 15020596 Mark Arsten Template:uw-vandalism1 +173.176.118.87 20111222035508 467134517 15020596 Mark Arsten Template:uw-vandalism1 +114.142.166.229 20111222035601 467134589 15020596 Mark Arsten Template:uw-vandalism1 +76.25.211.198 20111222035646 467134656 7695475 Jim1138 Template:uw-vandalism1 +68.3.112.168 20111222035745 467134737 7695475 Jim1138 Template:uw-vandalism1 +67.20.133.136 20111222042426 467136925 7695475 Jim1138 Template:uw-vandalism1 +68.5.93.197 20111222042635 467137125 7695475 Jim1138 Template:uw-vandalism1 +50.135.29.111 20111222042958 467137392 7695475 Jim1138 Template:uw-vandalism1 +99.91.215.6 20111222043424 467137772 7695475 Jim1138 Template:uw-vandalism1 +Eastderp1 20111222044200 467138536 7695475 Jim1138 Template:uw-vandalism1 +173.26.101.78 20111222044530 467138890 7695475 Jim1138 Template:uw-vandalism1 +80.194.26.184 20111222050300 467140546 7695475 Jim1138 Template:uw-vandalism1 +69.230.195.135 20111222050355 467140632 7695475 Jim1138 Template:uw-vandalism1 +Violinmagician 20111222050359 467140640 7695475 Jim1138 Template:uw-vandalism1 +76.166.147.85 20111222050726 467140963 7695475 Jim1138 Template:uw-vandalism1 +72.229.150.7 20111222050921 467141171 7695475 Jim1138 Template:uw-vandalism1 +96.250.109.41 20111222051253 467141520 7695475 Jim1138 Template:uw-vandalism1 +70.249.216.200 20111222051603 467141788 7695475 Jim1138 Template:uw-vandalism1 +76.114.237.52 20111222051804 467142013 7695475 Jim1138 Template:uw-vandalism1 +99.231.38.15 20111222052155 467142435 7695475 Jim1138 Template:uw-vandalism1 +70.232.36.209 20111222052236 467142502 58193 OverlordQ Template:uw-vandalism1 +70.171.84.236 20111222055646 467145424 7695475 Jim1138 Template:uw-vandalism1 +117.229.124.35 20111222060552 467146249 7695475 Jim1138 
Template:uw-vandalism1 +92.8.86.211 20111222061442 467147022 7695475 Jim1138 Template:uw-vandalism1 +Jonnygharris 20111222063512 467148666 2359527 Tgeairn Template:uw-vandalism1 +68.32.26.30 20111222064442 467149456 7695475 Jim1138 Template:uw-vandalism1 +122.172.47.49 20111222064608 467149587 7695475 Jim1138 Template:uw-vandalism1 +68.67.110.139 20111222064632 467149618 2359527 Tgeairn Template:uw-vandalism1 +58.68.46.210 20111222070300 467150889 2359527 Tgeairn Template:uw-vandalism1 +120.56.171.37 20111222070459 467151018 7695475 Jim1138 Template:uw-vandalism1 +24.185.100.176 20111222070521 467151047 2359527 Tgeairn Template:uw-vandalism1 +94.76.32.252 20111222070626 467151122 7695475 Jim1138 Template:uw-vandalism1 +71.2.35.65 20111222070954 467151442 2359527 Tgeairn Template:uw-vandalism1 +203.52.228.196 20111222071602 467152062 7695475 Jim1138 Template:uw-vandalism1 +124.84.101.186 20111222072924 467153017 2359527 Tgeairn Template:uw-vandalism1 +74.120.224.200 20111222073655 467153546 7695475 Jim1138 Template:uw-vandalism1 +70.230.154.187 20111222074036 467153814 7695475 Jim1138 Template:uw-vandalism1 +69.226.149.86 20111222074434 467154103 7695475 Jim1138 Template:uw-vandalism1 +75.131.131.219 20111222075957 467155209 7695475 Jim1138 Template:uw-vandalism1 +49.245.133.171 20111222080029 467155275 7695475 Jim1138 Template:uw-vandalism1 +152.118.24.10 20111222080106 467155326 7695475 Jim1138 Template:uw-vandalism1 +99.224.122.205 20111222081721 467156446 7695475 Jim1138 Template:uw-vandalism1 +198.240.133.75 20111222082450 467156945 7695475 Jim1138 Template:uw-vandalism1 +110.77.227.221 20111222083046 467157456 7695475 Jim1138 Template:uw-vandalism1 +2.188.4.3 20111222083310 467157657 7695475 Jim1138 Template:uw-vandalism1 +145.103.249.34 20111222084352 467158487 7695475 Jim1138 Template:uw-vandalism1 +50.64.10.116 20111222084516 467158585 7695475 Jim1138 Template:uw-vandalism1 +76.67.17.205 20111222085448 467159343 7695475 Jim1138 Template:uw-vandalism1 
+120.144.129.107 20111222092827 467162126 7695475 Jim1138 Template:uw-vandalism1 +14.139.243.229 20111222093150 467162401 7695475 Jim1138 Template:uw-vandalism1 +180.149.52.45 20111222093649 467162851 7695475 Jim1138 Template:uw-vandalism1 +24.191.25.201 20111222094616 467163617 7695475 Jim1138 Template:uw-vandalism1 +71.227.10.224 20111222095309 467164127 7695475 Jim1138 Template:uw-vandalism1 Modified: trunk/tools/wsor/message_templates/sql/test.sql =================================================================== --- trunk/tools/wsor/message_templates/sql/test.sql 2012-01-03 23:09:54 UTC (rev 107958) +++ trunk/tools/wsor/message_templates/sql/test.sql 2012-01-03 23:12:51 UTC (rev 107959) @@ -1 +1,47 @@ +( + SELECT + False as deleted, + page_namespace as ns, + count(*) as revisions + FROM enwiki.revision + INNER JOIN enwiki.page ON rev_page = page_id + WHERE rev_timestamp <= "20110101000000" + AND rev_user_text = "EpochFail" + GROUP BY page_namespace +) +UNION +( + SELECT + True as deleted, + ar_namespace as ns, + count(*) as revisions + FROM enwiki.archive + WHERE ar_timestamp <= "20110101000000" + AND ar_user_text = "EpochFail" + GROUP BY ar_namespace +) +SELECT + r.rev_id, + r.rev_timestamp, + r.rev_comment, + r.rev_user AS poster_id, + r.rev_user_text AS poster_name, + REPLACE(p.page_title, "_", " ") AS recipient_name +FROM revision r +INNER JOIN page p ON r.rev_page = p.page_id +WHERE rev_timestamp BETWEEN "20111222000000" AND "20111223000000" +AND page_namespace = 3; + + +SELECT + IF(log_params LIKE "%indefinite%", "ban", "block") as type, + IF(log_timestamp > "20110101000000", "after", "before") as tense, + count(*) as count, + min(log_timestamp) as first, + max(log_timestamp) as last +FROM logging +WHERE log_type = "block" +AND log_action = "block" +AND log_title = "EpochFail" +GROUP BY 1, 2; Added: trunk/tools/wsor/message_templates/umetrics/__init__.py =================================================================== --- 
trunk/tools/wsor/message_templates/umetrics/__init__.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/__init__.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1 @@ + Modified: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py =================================================================== --- trunk/tools/wsor/message_templates/generators/__init__.py 2012-01-02 20:26:32 UTC (rev 107849) +++ trunk/tools/wsor/message_templates/umetrics/generators/__init__.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -1,8 +1,14 @@ from .edit_counts import EditCounts +from .talk import Talk +from .blocks import Blocks +from .warnings import Warnings from .metric_generator import MetricGenerator GENERATORS = { - 'editcounts': EditCounts + 'editcounts': EditCounts, + 'talk': Talk, + 'blocks': Blocks, + 'warnings': Warnings } class Metrics(MetricGenerator): Added: trunk/tools/wsor/message_templates/umetrics/generators/blocks.py =================================================================== --- trunk/tools/wsor/message_templates/umetrics/generators/blocks.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/generators/blocks.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,57 @@ +import itertools +from .metric_generator import MetricGenerator + +class Blocks(MetricGenerator): + + def __init__(self, conn, api): + self.conn = conn + + def headers(self): + return [ + 'blocks_before', + 'blocks_after', + 'first_block_before', + 'last_block_before', + 'first_block_after', + 'last_block_after', + 'bans_before', + 'bans_after', + 'first_ban_before', + 'last_ban_before', + 'first_ban_after', + 'last_ban_after' + ] + + def values(self, username, timestamp): + rowValues = {} + + cursor = self.conn.cursor() + cursor.execute(""" + SELECT + IF(log_params LIKE "%%indefinite%%", "ban", "block") as type, + IF(log_timestamp > %(timestamp)s, "after", "before") as whense, + count(*) as count, + min(log_timestamp) as first, + max(log_timestamp) as last + FROM logging 
+ WHERE log_type = "block" + AND log_action = "block" + AND log_title = %(username)s + GROUP BY 1, 2 + """, + { + 'timestamp': timestamp, + 'username': username.encode('utf-8').replace(" ", "_") + } + ) + for row in cursor: + rowValues['%(type)ss_%(whense)s' % row] = row['count'] + rowValues['first_%(type)s_%(whense)s' % row] = row['first'] + rowValues['last_%(type)s_%(whense)s' % row] = row['last'] + + rowValues['blocks_before'] = rowValues.get('blocks_before', 0) + rowValues['blocks_after'] = rowValues.get('blocks_after', 0) + rowValues['bans_before'] = rowValues.get('bans_before', 0) + rowValues['bans_after'] = rowValues.get('bans_after', 0) + + return [rowValues.get(c) for c in self.headers()] Modified: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py =================================================================== --- trunk/tools/wsor/message_templates/generators/edit_counts.py 2012-01-02 20:26:32 UTC (rev 107849) +++ trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -3,16 +3,16 @@ class EditCounts(MetricGenerator): - def __init__(self, conn, api_uri): + def __init__(self, conn, api): self.conn = conn def headers(self): return itertools.chain(*[ [ - 'ns_%s_before_revisions_deleted' % ns, - 'ns_%s_after_revisions_deleted' % ns, - 'ns_%s_before_revisions_not_deleted' % ns, - 'ns_%s_after_revisions_not_deleted' % ns + 'ns_%s_revisions_deleted_before' % ns, + 'ns_%s_revisions_deleted_after' % ns, + 'ns_%s_revisions_not_deleted_before' % ns, + 'ns_%s_revisions_not_deleted_after' % ns ] for ns in itertools.chain(range(0,16), [100, 101, 108, 109]) ]) Added: trunk/tools/wsor/message_templates/umetrics/generators/talk.py =================================================================== --- trunk/tools/wsor/message_templates/umetrics/generators/talk.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/generators/talk.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,51 @@ 
+import itertools +from .metric_generator import MetricGenerator + +class Talk(MetricGenerator): # counts revisions made by other users to the namespace 3 page titled after the user, split into before/after the given timestamp + + def __init__(self, conn, api): + self.conn = conn + + def headers(self): + return [ + 'other_talk_before', + 'first_other_talk_before', + 'last_other_talk_before', + 'other_talk_after', + 'first_other_talk_after', + 'last_other_talk_after', + ] + + def values(self, username, timestamp): + rowValues = {} + + cursor = self.conn.cursor() + cursor.execute(""" + SELECT + IF(rev_timestamp > %(timestamp)s, "after", "before") as whense, + COUNT(*) as count, + MAX(rev_timestamp) as last, + MIN(rev_timestamp) as first + FROM revision + INNER JOIN page ON rev_page = page_id + WHERE page_namespace = 3 + AND rev_timestamp != %(timestamp)s + AND page_title = %(page_title)s + AND rev_user_text != %(username)s + GROUP BY 1 + """, + { + 'timestamp': timestamp, + 'page_title': username.encode('utf-8').replace(" ", "_"), + 'username': username.encode('utf-8') + } + ) + for row in cursor: # keys are built from the SQL alias "whense" and must be interpolated with the row so they match headers() + rowValues['other_talk_%(whense)s' % row] = row['count'] + rowValues['first_other_talk_%(whense)s' % row] = row['first'] + rowValues['last_other_talk_%(whense)s' % row] = row['last'] + + rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0) + rowValues['other_talk_after'] = rowValues.get('other_talk_after', 0) + + return [rowValues.get(c) for c in self.headers()] Added: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py =================================================================== --- trunk/tools/wsor/message_templates/umetrics/generators/warnings.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/generators/warnings.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,97 @@ +import itertools, wmf, difflib, re +from .metric_generator import MetricGenerator + + +class Warnings(MetricGenerator): + + WARN_RE = re.compile(r'<!--\s*Template:uw-') + + def __init__(self, conn, api): + self.conn = conn + self.api = api + + def headers(self): + return [ + 'warns_before', + 
'warns_after', + 'first_warn_before', + 'last_warn_before', + 'first_warn_after', + 'last_warn_after' + ] + + def values(self, username, timestamp): + rowValues = { + 'warns_before': 0, + 'warns_after': 0 + } + + timestamp = wmf.wp2Timestamp(timestamp) + + for rev in self.getProcessedRevs(username): + #determine if we have a warning + if self.WARN_RE.search(rev['added']) != None: + if rev['timestamp'] < timestamp: + whence = "before" + elif rev['timestamp'] > timestamp: + whence = "after" + else: + continue + + rowValues['warns_%s' % whence] += 1 + + if 'first_warn_%s' % whence not in rowValues: + rowValues['first_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp']) + + rowValues['last_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp']) + + return [rowValues.get(c) for c in self.headers()] + + def getProcessedRevs(self, username): + return self.processRevs(self.getUserPageRevisions(username)) + + def getUserPageRevisions(self, username, rvcontinue=None): + js = self.api.request( + action="query", + prop="revisions", + titles="User_talk:%s" % username, + rvprop="ids|timestamp|content", + rvdir="newer", + rvlimit=50, + rvcontinue=rvcontinue + ) + + for rev in js['query']['pages'].values()[0]['revisions']: + rev['timestamp'] + yield rev + + if 'query-continue' in js: + for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']['rvstartid']): + yield rev + + + + def processRevs(self, revs): + + previousLines = [] + for rev in revs: + lines = rev.get('*', "").split("\n") + del rev['*'] + + added = [] + sm = difflib.SequenceMatcher(None, previousLines, lines) + for tag, i1, i2, j1, j2 in sm.get_opcodes(): + if tag == "insert": + added.extend(lines[j1:j2]) + elif tag == "replace": + added.extend(lines[j1:j2]) + + + rev['added'] = "\n".join(added) + rev['timestamp'] = wmf.wp2Timestamp(rev['timestamp']) + yield rev + previousLines = lines + + + + Copied: trunk/tools/wsor/message_templates/umetrics/metrics.py (from rev 107849, 
trunk/tools/wsor/message_templates/user_metrics.py) =================================================================== --- trunk/tools/wsor/message_templates/umetrics/metrics.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/metrics.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,100 @@ +import sys, argparse, os +import logging, types +import MySQLdb, MySQLdb.cursors + +from .generators import GENERATORS, Metrics +from .util import MWAPI, MWAPIError + +def encode(v): + if v == None: return "\N" + + if type(v) == types.LongType: v = int(v) + elif type(v) == types.UnicodeType: v = v.encode('utf-8') + + return str(v).encode("string-escape") + + +def main(): + + parser = argparse.ArgumentParser( + description=""" + Gathers metrics for users around a timestamp. + """, + conflict_handler="resolve" + ) + parser.add_argument( + '-c', '--cnf', + metavar="<path>", + type=str, + help='the path to MySQL config info (defaults to ~/.my.cnf)', + default=os.path.expanduser("~/.my.cnf") + ) + parser.add_argument( + '-h', '--host', + type=str, + help='the database host to connect to (defaults to localhost)', + default="localhost" + ) + parser.add_argument( + '-d', '--db', + type=str, + help='the language db to run the query in (defaults to enwiki)', + default="enwiki" + ) + parser.add_argument( + '-a', '--api', + type=MWAPI, + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', + default="http://en.wikipedia.org/w/api.php" + ) + parser.add_argument( + 'generator', + type=lambda g: GENERATORS[g], + nargs="+", + help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys()) + ) + args = parser.parse_args() + + LOGGING_STREAM = sys.stderr + logging.basicConfig( + level=logging.DEBUG, + stream=LOGGING_STREAM, + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%b-%d %H:%M:%S' + ) + + if sys.stdin.isatty(): + logging.error("No data piped to standard in!") + return + + + 
logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) + conn = MySQLdb.connect( + host=args.host, + db=args.db, + read_default_file=args.cnf, + cursorclass=MySQLdb.cursors.DictCursor + ) + + logging.info("Loading generators...") + metrics = Metrics(g(conn, args.api) for g in args.generator) + print("\t".join(encode(h) for h in metrics.headers())) + + + logging.info("Processing users...") + for line in sys.stdin: + username, timestamp = line.strip().split("\t")[0:2] + username = unicode(username, 'utf-8') + + logging.debug("\t%s at %s:" % (username, timestamp)) + print("\t".join(encode(v) for v in metrics.values(username, timestamp))) + LOGGING_STREAM.write("o") + + LOGGING_STREAM.write("\n") + + + + + +if __name__ == "__main__": + main() Copied: trunk/tools/wsor/message_templates/umetrics/postings.py (from rev 107849, trunk/tools/wsor/message_templates/message_postings.py) =================================================================== --- trunk/tools/wsor/message_templates/umetrics/postings.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/postings.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,282 @@ +''' +This script connects to a mediawiki database and API to collect User_talk revisions +that match a set of patterns (and optionally, username). + +:Parameters: + Access the script's documentation for a parameter listing. + + % python message_postings.py --help + +:Output: + This script writes a set of escaped, tab separated columns to standard out. + - Recipient name - The name of the user who received the posting + - Timestamp - The time at which the posting was made + - Revision ID - The identifier of the revision matching the posting + - Poster ID - The identifier of the user who made the posting + - Poster name - The name of the user who make the posting + - Message match - The portion of the message posting that was matched by the regular expression. 
+ +:Example: + python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1" +''' +import sys, argparse, os +import logging, types, re +import time, datetime +import MySQLdb, MySQLdb.cursors +import urllib, urllib2, json, htmlentitydefs +import wmf + +class MissingRevError(Exception):pass + +def encode(v): + if v == None: return "\N" + + if type(v) == types.LongType: v = int(v) + elif type(v) == types.UnicodeType: v = v.encode('utf-8') + + return str(v).encode("string-escape") + +def emit(rev): + + print( + "\t".join( + encode(rev[c]) for c in [ + 'recipient_name', + 'rev_timestamp', + 'rev_id', + 'poster_id', + 'poster_name', + 'message_match' + ] + ) + ) + + +# MediaWiki Date format +# +# | year | month | day | hour | minute | second | +MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") +def mwDate(string): + if MW_DATE.match(string) == None: + raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) + else: + return string + +def main(): + parser = argparse.ArgumentParser( + description=""" + Gathers experimental message postings from user_talk messages. 
+ """, + epilog=""" + python message_postings.py + -h db42 + --start=20111222000000 + --end=20111223000000 + --comment="\(\[\[WP:HG\|HG\]\]\)" + --message="Template:uw-vandalism1" + """, + conflict_handler="resolve" + ) + parser.add_argument( + '-c', '--cnf', + metavar="<path>", + type=str, + help='the path to MySQL config info (defaults to ~/.my.cnf)', + default=os.path.expanduser("~/.my.cnf") + ) + parser.add_argument( + '-h', '--host', + type=str, + help='the database host to connect to (defaults to localhost)', + default="localhost" + ) + parser.add_argument( + '-d', '--db', + type=str, + help='the language db to run the query in (defaults to enwiki)', + default="enwiki" + ) + parser.add_argument( + '-a', '--api_uri', + type=str, + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', + default="http://en.wikipedia.org/w/api.php" + ) + parser.add_argument( + '--start', + type=mwDate, + help='the start of the experimental period. (Required)', + required=True + ) + parser.add_argument( + '--end', + type=mwDate, + help='the end of the experimental period. (defaults to NOW())', + default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") + ) + parser.add_argument( + '--user_name', + type=str, + help='the user_name to further filter postings by (useful for tracking bots)' + ) + parser.add_argument( + '--comment', + type=re.compile, + help='regular expression to match against message posting comment' + ) + parser.add_argument( + '--message', + type=re.compile, + help='regular expression to match against message content (required)', + required=True + ) + args = parser.parse_args() + + LOGGING_STREAM = sys.stderr + logging.basicConfig( + level=logging.DEBUG, + stream=LOGGING_STREAM, + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%b-%d %H:%M:%S' + ) + logging.debug("Comment pattern is %r." % args.comment.pattern) + logging.debug("Message pattern is %r." 
% args.message.pattern) + + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) + db = Database( + host=args.host, + db=args.db, + read_default_file=args.cnf + ) + + logging.info("Connecting to API @ %s." % args.api_uri) + api = WPAPI(args.api_uri) + + logging.info("Querying for matching revisions:") + count = {"matched": 0, "missed": 0} + for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): + message = api.getAdded(rev['rev_id']) + match = args.message.search(message) + if match != None: + rev['message_match'] = match.group(0) + + emit(rev) + LOGGING_STREAM.write("|") + count['matched'] += 1 + else: + LOGGING_STREAM.write("o") + count['missed'] += 1 + + LOGGING_STREAM.write("\n") + logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count) + + + +class Database: + + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + self.conn = MySQLdb.connect(*args, **kwargs) + + def getPostings(self, start, end, userName=None, commentRE=None): + if (userName, commentRE) == (None, None): + raise TypeError("Must specify at at least one of userName or commentRE.") + + cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) + query = """ + SELECT + r.rev_id, + r.rev_timestamp, + r.rev_comment, + r.rev_user AS poster_id, + r.rev_user_text AS poster_name, + REPLACE(p.page_title, "_", " ") AS recipient_name + FROM revision r + INNER JOIN page p ON r.rev_page = p.page_id + WHERE rev_timestamp BETWEEN %(start)s AND %(end)s + AND page_namespace = 3 + """ + if userName != None: + query += "AND rev_user_text = %(user_name)s\n" + if commentRE != None: + query += "AND rev_comment REGEXP %(comment_pattern)s\n" + + cursor.execute( + query, + { + 'start': start, + 'end': end, + 'user_name': userName, + 'comment_pattern': commentRE.pattern + } + ) + + for row in cursor: + yield row + + + +class WPAPI: + DIFF_ADD_RE = re.compile(r'<td 
class="diff-addedline"><div>(.+)</div></td>') + + def __init__(self, uri): + self.uri = uri + + def getDiff(self, revId, retries=10): + attempt = 0 + while attempt < retries: + try: + response = urllib2.urlopen( + self.uri, + urllib.urlencode({ + 'action': 'query', + 'prop': 'revisions', + 'revids': revId, + 'rvprop': 'ids', + 'rvdiffto': 'prev', + 'format': 'json' + }) + ) + result = json.load(response) + return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] + except urllib2.HTTPError as e: + time.sleep(attempt*2) + attempt += 1 + + + + def getAdded(self, revId): + diff = self.getDiff(revId) + + return self.unescape( + "\n".join( + match.group(1) + for match in WPAPI.DIFF_ADD_RE.finditer(diff) + ) + ) + + def unescape(self, text): + def fixup(m): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + return text # leave as is + return re.sub("&#?\w+;", fixup, text) + + +if __name__ == "__main__": + main() Added: trunk/tools/wsor/message_templates/umetrics/util/__init__.py =================================================================== --- trunk/tools/wsor/message_templates/umetrics/util/__init__.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/util/__init__.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1 @@ +from .mw_api import MWAPI, MWAPIError Added: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py =================================================================== --- trunk/tools/wsor/message_templates/umetrics/util/mw_api.py (rev 0) +++ trunk/tools/wsor/message_templates/umetrics/util/mw_api.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -0,0 +1,52 @@ +import urllib2, urllib, json +import time +from cookielib import CookieJar + +class 
MWAPIError(Exception): + def __init__(self, code, message): + self.code = code + self.info = info + + def __repr__(self): + return "%s(%s)" % ( + self.__class__.__name__, + ", ".join( + repr(self.code), + repr(self.info) + ) + ) + + def __str__(self): + return "%s: %s" % (self.code, self.info) + +class MWAPI: + + def __init__(self, uri): + self.uri = uri + self.cookies = CookieJar() + + def request(self, retry=0, **kwargs): + kwargs['format'] = "json" + + request = urllib2.Request( + self.uri, + urllib.urlencode(kwargs) + ) + self.cookies.add_cookie_header(request) + + try: + response = urllib2.urlopen(request) + except urllib2.HTTPError: + #wait and try again + time.sleep(2**retry) + self.request(retry=retry+1, **kwargs) + + self.cookies.extract_cookies(response, request) + + js = json.load(response) + + if 'error' in js: + raise MWAPIError(js['error']['code'], js['error']['info']) + else: + return js + Deleted: trunk/tools/wsor/message_templates/user_metrics.py =================================================================== --- trunk/tools/wsor/message_templates/user_metrics.py 2012-01-03 23:09:54 UTC (rev 107958) +++ trunk/tools/wsor/message_templates/user_metrics.py 2012-01-03 23:12:51 UTC (rev 107959) @@ -1,102 +0,0 @@ -import sys, argparse, os -import logging, types -import MySQLdb, MySQLdb.cursors - -from generators import GENERATORS, Metrics, EditCounts - - -class MissingRevError(Exception):pass - -def encode(v): - if v == None: return "\N" - - if type(v) == types.LongType: v = int(v) - elif type(v) == types.UnicodeType: v = v.encode('utf-8') - - return str(v).encode("string-escape") - - -def main(): - - parser = argparse.ArgumentParser( - description=""" - Gathers metrics for users around a timestamp. 
- """, - conflict_handler="resolve" - ) - parser.add_argument( - '-c', '--cnf', - metavar="<path>", - type=str, - help='the path to MySQL config info (defaults to ~/.my.cnf)', - default=os.path.expanduser("~/.my.cnf") - ) - parser.add_argument( - '-h', '--host', - type=str, - help='the database host to connect to (defaults to localhost)', - default="localhost" - ) - parser.add_argument( - '-d', '--db', - type=str, - help='the language db to run the query in (defaults to enwiki)', - default="enwiki" - ) - parser.add_argument( - '-a', '--api_uri', - type=str, - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', - default="http://en.wikipedia.org/w/api.php" - ) - parser.add_argument( - 'generator', - type=lambda g: GENERATORS[g], - nargs="+", - help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys()) - ) - args = parser.parse_args() - - LOGGING_STREAM = sys.stderr - logging.basicConfig( - level=logging.DEBUG, - stream=LOGGING_STREAM, - format='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%b-%d %H:%M:%S' - ) - - if sys.stdin.isatty(): - logging.error("No data piped to standard in!") - return - - - logging.info("Connecting to %s:%s using %s." 
% (args.host, args.db, args.cnf)) - conn = MySQLdb.connect( - host=args.host, - db=args.db, - read_default_file=args.cnf, - cursorclass=MySQLdb.cursors.DictCursor - ) - - logging.info("Loading generators...") - metrics = Metrics(g(conn, args.api_uri) for g in args.generator) - print("\t".join(encode(h) for h in metrics.headers())) - - - logging.info("Processing users...") - for line in sys.stdin: - username, timestamp = line.strip().split("\t")[0:2] - username = unicode(username, 'utf-8') - - logging.debug("\t%s at %s:" % (username, timestamp)) - print("\t".join(encode(v) for v in metrics.values(username, timestamp))) - LOGGING_STREAM.write("o") - - LOGGING_STREAM.write("\n") - - - - - -if __name__ == "__main__": - main() _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs