http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76845
Revision: 76845 Author: diederik Date: 2010-11-16 23:07:21 +0000 (Tue, 16 Nov 2010) Log Message: ----------- This directory contains the final scripts to generate charts. Added Paths: ----------- trunk/tools/editor_trends/analyses/ trunk/tools/editor_trends/analyses/__init__.py trunk/tools/editor_trends/analyses/cohort_charts.py trunk/tools/editor_trends/analyses/file_size_reduction.py Property changes on: trunk/tools/editor_trends/analyses/__init__.py ___________________________________________________________________ Added: svn:eol-style + native Added: trunk/tools/editor_trends/analyses/cohort_charts.py =================================================================== --- trunk/tools/editor_trends/analyses/cohort_charts.py (rev 0) +++ trunk/tools/editor_trends/analyses/cohort_charts.py 2010-11-16 23:07:21 UTC (rev 76845) @@ -0,0 +1,30 @@ +__author__ = '''\n'''.join(['Diederik van Liere (dvanli...@gmail.com)', ]) +__author__email = 'dvanliere at gmail dot com' +__date__ = '2010-11-10' +__version__ = '0.1' + +import configuration +settings = configuration.Settings() +from utils import utils + +def prepare_cohort_dataset(): + dataset = utils.load_object(settings.binary_location, 'cohort_data.bin') + fh = utils.create_txt_filehandle(settings.dataset_location, 'cohort_data.txt', 'w', settings.encoding) + + years = dataset.keys() + years.sort() + periods = dataset[2001].keys() + periods.sort() + periods.remove('n') + headers = ['months_%s' % i for i in periods] + headers.insert(0, 'year') + utils.write_list_to_csv(headers, fh) + for year in years: + n = float(dataset[year].pop('n')) + obs = [100 * float(dataset[year][p]) / n for p in periods] + raw = [dataset[year][p] for p in periods] + print sum(obs) + obs.insert(0, year) + utils.write_list_to_csv(obs, fh, newline=False) + utils.write_list_to_csv(raw, fh) + fh.close() Property changes on: trunk/tools/editor_trends/analyses/cohort_charts.py ___________________________________________________________________ Added: svn:eol-style + native Added: trunk/tools/editor_trends/analyses/file_size_reduction.py =================================================================== --- trunk/tools/editor_trends/analyses/file_size_reduction.py (rev 0) +++ trunk/tools/editor_trends/analyses/file_size_reduction.py 2010-11-16 23:07:21 UTC (rev 76845) @@ -0,0 +1,99 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +''' +Copyright (C) 2010 by Diederik van Liere (dvanli...@gmail.com) +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License version 2 +as published by the Free Software Foundation. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +See the GNU General Public License for more details, at +http://www.fsf.org/licenses/gpl.html +''' + +__author__ = '''\n'''.join(['Diederik van Liere (dvanli...@gmail.com)', ]) +__author__email = 'dvanliere at gmail dot com' +__date__ = '2010-11-15' +__version__ = '0.1' + +import sys +sys.path.append('..') + +import os +import xml.etree.cElementTree as cElementTree + +import configuration +from utils import utils +settings = configuration.Settings() + + +class DumpStatistics(object): + ''' Simple class to keep track of XML tags, how often they occur, + and the length of strings they contain. This is used to calculate the + overhead. + ''' + def __init__(self): + self.tags = {} + + def add_tag(self, kwargs): + for kw in kwargs: + if kw not in self.tags: + self.tags[kw] = {} + self.tags[kw]['n'] = 0 + self.tags[kw]['size'] = 0 + self.tags[kw]['n'] += 1 + self.tags[kw]['size'] += self.determine_length(kwargs[kw]) + + def average_size_text(self): + avg = {} + for kw in self.tags: + avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n'] + return avg + + def total_size_text(self): + return sum([self.tags[kw]['size'] for kw in self.tags]) + + def total_size_xml(self): + # the x2 is for the opening and closing tag + # the +5 is for 2x <, 2x > and 1x / + return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags]) + + def determine_length(self, text): + if text == None: + return 0 + else: + return len(text) + + +def calculate_filesize_overhead(location, filename): + counter = None + ds = DumpStatistics() + context = cElementTree.iterparse(filename, events=('start', 'end')) + context = iter(context) + event, root = context.next() #get the root element of the XML doc + + try: + for event, elem in context: + if event == 'end': + ds.add_tag({elem.tag:elem.text}) + root.clear() # when done parsing a section clear the tree to release memory + except SyntaxError: + pass + utils.store_object(ds, settings.binary_location, 'ds') + xml_size = ds.total_size_xml() + text_size = ds.total_size_text() + print text_size, xml_size + print ds.tags + + +def output_dumpstatistics(): + ds = utils.load_object(settings.binary_location, 'ds.bin') + + for key in ds.tags: + print '%s\t%s' % (key, ds.tags[key]) + +if __name__ == '__main__': + output_dumpstatistics() + #calculate_filesize_overhead(settings.input_location, settings.input_filename) + Property changes on: trunk/tools/editor_trends/analyses/file_size_reduction.py ___________________________________________________________________ Added: svn:eol-style + native _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs