https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113492
# Revision: 113492
# Author:   giovanni
# Date:     2012-03-09 18:29:43 +0000 (Fri, 09 Mar 2012)
#
# Log Message:
# -----------
# added scripts for analyzing daily cohort edit count data
#
# Added Paths:
# -----------
#   trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
#   trunk/tools/wsor/editor_lifecycle/scripts/mksamples
#
# NOTE: the commit adds two independent script files.  They are listed one
# after the other below; split the listing at the "Added:" banners to obtain
# the individual scripts (each defines its own `parser` and `main`).

# ===================================================================
# Added: trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
#        (rev 113492, 2012-03-09 18:29:43 UTC)
# ===================================================================
#!/usr/bin/python
#:vim:ft=python
# encoding:utf-8

''' fits daily count samples '''

import os
from collections import defaultdict
from contextlib import closing
from argparse import ArgumentParser
import numpy as np
from scipy.stats import nbinom, geom, poisson, chisquare
from scipy.optimize import fmin
try:
    import matplotlib.pyplot as pp
except ImportError:
    # Only the (commented-out) scatter plot below needs matplotlib, so the
    # fitting itself must not fail when it is not installed.
    pp = None

parser = ArgumentParser(description=__doc__)
parser.add_argument('input_path', metavar='file')

# Distributions that are fitted to every daily sample, and the starting point
# handed to the optimizer for each of them (keyed by the distribution's name).
models = [ nbinom, ] # poisson, geom ]
initial_args = { 'nbinom' : (5,.5), 'poisson' : (10,), 'geom' : (.5,) }

def main(args):
    '''
    Fit each distribution in `models` to every line of `args.input_path`.

    Each input line is one sample of whitespace-separated integers.  Reading
    stops at the first line holding fewer than 5 values.  For every sample
    and model the parameters are estimated by minimizing the negative
    log-likelihood with `fmin`, and a chi-square goodness-of-fit p-value is
    computed with `ddof` set to the number of fitted parameters.

    Returns a pair of dicts `(model_params, model_pvalue)` keyed by model
    name; the values are arrays with one entry per fitted line (empty arrays
    when no line could be fitted).
    '''
    model_params = defaultdict(list)
    model_pvalue = defaultdict(list)
    with open(args.input_path) as infile:
        for i, line in enumerate(infile):
            # A list (not a lazy `map` object) so this also works on Python 3.
            sample = np.asarray([int(tok) for tok in line.split()])
            if len(sample) < 5:
                print('day %d: skipping rest of file' % i)
                break
            # One unit-wide bin per integer in [min, max].  Asking histogram
            # for `ptp` equal-width bins would lump the two largest values
            # into the final (closed) bin while the pmf below is evaluated
            # at the left edges only.
            edges = np.arange(sample.min(), sample.max() + 2)
            f_obs, edges = np.histogram(sample, bins=edges)
            values = edges[:-1]
            for rv in models:
                # Bind rv/sample as defaults so the objective does not depend
                # on the loop variables after this iteration.
                def nll(k, rv=rv, sample=sample):
                    return -rv(*k).logpmf(sample).sum()
                beta = fmin(nll, initial_args[rv.name], disp=False)
                pmf = rv(*beta).pmf(values)
                # Expected frequencies scale with the number of observations,
                # not with the sum of the observed values.  They are also
                # renormalized over the observed support so that their total
                # matches sum(f_obs); recent SciPy releases reject chisquare
                # inputs whose totals disagree.
                f_exp = pmf / pmf.sum() * len(sample)
                chisq, pval = chisquare(f_obs, f_exp, rv.numargs)
                model_params[rv.name].append(beta)
                model_pvalue[rv.name].append(pval)
            print('day %d: done' % i)
    # Always return an entry per model, even when nothing was fitted
    # (previously this raised KeyError for an empty/too-short input).
    params = {rv.name: np.asarray(model_params[rv.name]) for rv in models}
    pvalues = {rv.name: np.asarray(model_pvalue[rv.name]) for rv in models}
    return params, pvalues


# pp.scatter(params.T[0], params.T[1], c='k', marker='.')
# pp.show()


if __name__ == '__main__':
    args = parser.parse_args()
    params, pvalues = main(args)

# ===================================================================
# Added: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
#        (rev 113492, 2012-03-09 18:29:43 UTC)
# ===================================================================
#!/usr/bin/python
#:vim:ft=python
# encoding:utf-8

''' groups user counts by day since registration '''

import os
from argparse import ArgumentParser
import numpy as np
from scipy.sparse import coo_matrix
from collections import deque
from collections import defaultdict
from contextlib import closing

parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', metavar='file', nargs='+')
parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
        metavar='PREFIX')

def main(args):
    '''
    Convert every archive in `args.input_paths` into a per-day .tsv file.

    Each archive is opened with `np.load` and is expected to hold one record
    array per user with the fields `day`, `ns` and `edits`.  Rows with a
    negative `ns` are dropped, days are re-based to the user's first
    remaining day, and edits are summed over `ns` for each day.

    The output file is `args.output_prefix` + the input basename with its
    extension replaced by '.tsv'.  Line k (0-based) holds the space-separated
    counts of all users on their k-th day, up to and including the largest
    day seen.  Users without any row with `ns >= 0` are skipped.
    '''
    for path in args.input_paths:
        output_path = args.output_prefix + os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'
        day_counts = defaultdict(list)
        # closing() releases the archive's file handle once we are done.
        with closing(np.load(path)) as archive:
            print('%d users in %s' % (len(archive.files), path))
            for uid in archive.files:
                data = archive[uid].view(np.recarray)
                data = data[data.ns >= 0]
                if data.size == 0:
                    # Nothing left for this user; min() below would fail.
                    continue
                # rows = days since the user's first day, columns = ns;
                # summing along axis 1 yields the total edits per day.
                counts = coo_matrix((data.edits, (data.day - data.day.min(),
                    data.ns))).tocsr().sum(axis=1)
                for day, n in enumerate(np.asarray(counts).ravel()):
                    day_counts[day].append(int(n))
        with open(output_path, 'w') as out_file:
            if day_counts:
                # + 1 so that the last day is written as well.
                for day in range(max(day_counts) + 1):
                    row = ' '.join(map(str, day_counts.get(day, [])))
                    out_file.write(row + '\n')
        print('%s saved.' % output_path)

if __name__ == '__main__':
    args = parser.parse_args()
    main(args)

# _______________________________________________
# MediaWiki-CVS mailing list
# MediaWiki-CVS@lists.wikimedia.org
# https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs