https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113492

Revision: 113492
Author:   giovanni
Date:     2012-03-09 18:29:43 +0000 (Fri, 09 Mar 2012)
Log Message:
-----------
added scripts for analyzing daily cohort edit count data

Added Paths:
-----------
    trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
    trunk/tools/wsor/editor_lifecycle/scripts/mksamples

Added: trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fitcounts    (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fitcounts    2012-03-09 18:29:43 UTC (rev 113492)
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' fits daily count samples '''
+
+import os
+from contextlib import closing
+from argparse import ArgumentParser
+import numpy as np
+from scipy.stats import nbinom, geom, poisson, chisquare
+from scipy.optimize import fmin
+import matplotlib.pyplot as pp
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_path', metavar='file')
+
+models = [ nbinom, ] # poisson, geom ]
+initial_args = { 'nbinom' : (5,.5), 'poisson' : (10,), 'geom' : (.5,) }
+
+def main(args):
+    model_params = {}
+    model_pvalue = {}
+    with closing(open(args.input_path)) as infile:
+        for i, line in enumerate(infile):
+            sample = np.asarray(map(int, line.split())) # one day's per-user counts
+            if len(sample) < 5:
+                print 'day %d: skipping rest of file' % i
+                break
+            f_obs, bins = np.histogram(sample, bins=sample.ptp() or 1) # observed frequencies
+            for rv in models:
+                nll = lambda k : - rv(*k).logpmf(sample).sum() # negative log-likelihood
+                beta = fmin(nll, initial_args[rv.name], disp=False) # ML estimate
+                f_exp = rv(*beta).pmf(bins[:-1]) * len(sample) # expected counts = pmf * sample size
+                chisq, pval = chisquare(f_obs, f_exp, rv.numargs) # ddof = no. of fitted params
+                try:
+                    model_params[rv.name].append(beta)
+                    model_pvalue[rv.name].append(pval)
+                except KeyError:
+                    model_params[rv.name] = [ beta ] 
+                    model_pvalue[rv.name] = [ pval ]
+            print 'day %d: done' % i
+    for rv in models:
+        model_params[rv.name] = np.asarray(model_params[rv.name])
+        model_pvalue[rv.name] = np.asarray(model_pvalue[rv.name])
+    return model_params, model_pvalue
+
+
+#        pp.scatter(params.T[0], params.T[1], c='k', marker='.')
+#        pp.show()
+        
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    params, pvalues = main(args)
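
For reference, the core of fitcounts is a per-day maximum-likelihood fit of the
negative binomial followed by a chi-square goodness-of-fit test. A minimal,
self-contained sketch of that step, not part of the committed script (the seed,
the synthetic sample and the starting values are illustrative, and the expected
frequencies are rescaled to the observed total so the test is well defined):

    import numpy as np
    from scipy.stats import nbinom, chisquare
    from scipy.optimize import fmin

    np.random.seed(0)
    sample = nbinom.rvs(5, 0.5, size=200)  # stand-in for one day's per-user counts

    def nll(k):
        # negative log-likelihood; reject parameters outside the support
        n, p = k
        if n <= 0 or not 0 < p <= 1:
            return np.inf
        return -nbinom.logpmf(sample, n, p).sum()

    beta = fmin(nll, (5, 0.5), disp=False)  # ML estimate of (n, p)

    # observed vs expected frequencies over unit-width bins
    f_obs, bins = np.histogram(sample, bins=sample.ptp() or 1)
    f_exp = nbinom.pmf(bins[:-1], *beta)
    f_exp = f_exp / f_exp.sum() * f_obs.sum()  # rescale to the observed total
    chisq, pval = chisquare(f_obs, f_exp, ddof=nbinom.numargs)
    print(pval)

As in the script, the number of fitted parameters is passed as ddof so the test
accounts for estimating them from the same sample.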

Added: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/mksamples    (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/mksamples    2012-03-09 18:29:43 UTC (rev 113492)
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' groups user counts by day since registration '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+from contextlib import closing
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_paths', metavar='file', nargs='+')
+parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
+        metavar='PREFIX')
+
+def main(args):
+    for path in args.input_paths:
+        output_path = args.output_prefix + os.path.basename(path)
+        output_path = os.path.splitext(output_path)[0] + '.tsv'
+        day_counts = {}
+        archive = np.load(path)
+        N = len(archive.files)
+        print '%d users in %s' % (N, path)
+        with closing(open(output_path, 'w')) as out_file:
+            for uid in archive.files:
+                data = archive[uid].view(np.recarray)
+                idx = data.ns >= 0 # drop virtual namespaces (ns < 0)
+                data = data[idx]
+                counts = coo_matrix((data.edits, (data.day - data.day.min(),
+                    data.ns))).tocsr().sum(axis=1) # total edits per day across namespaces
+                for day in xrange(counts.shape[0]):
+                    n = int(counts[day])
+                    try:
+                        day_counts[day].append(n)
+                    except KeyError:
+                        day_counts[day] = deque([n])
+            max_day = max(day_counts.keys())
+            for day in xrange(max_day + 1): # include the last day
+                print >> out_file, ' '.join(map(str, day_counts.get(day, [])))
+        print '%s saved.' % output_path
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    main(args)
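
The grouping step in mksamples relies on duplicate coordinates in a COO sparse
matrix being summed when it is converted to CSR, so per-day, per-namespace edit
counts collapse into per-day totals. A small illustration with invented records
mirroring the day/ns/edits fields the script expects:

    import numpy as np
    from scipy.sparse import coo_matrix

    # toy records for one user: (day, ns, edits) -- values are invented
    data = np.array([(0, 0, 3), (0, 0, 4), (0, 1, 2), (1, 0, 5), (3, 0, 1)],
                    dtype=[('day', int), ('ns', int), ('edits', int)]).view(np.recarray)

    # duplicate (day, ns) coordinates are summed on conversion to CSR;
    # summing over the namespace axis then gives total edits per day
    per_day = coo_matrix((data.edits, (data.day - data.day.min(), data.ns)))
    totals = np.asarray(per_day.tocsr().sum(axis=1)).ravel()

    print(' '.join(map(str, totals)))  # -> 9 5 0 1

Each per-day total computed this way contributes one value to that day's line of
output, so every line ends up holding one count per active user, which is the
sample format fitcounts reads back in.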

