http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76211
Revision: 76211 Author: diederik Date: 2010-11-06 19:37:32 +0000 (Sat, 06 Nov 2010) Log Message: ----------- Fixes include: * utf8 support on console if proper fonts are installed * separation of concerns Modified Paths: -------------- trunk/tools/editor_trends/construct_datasets.py trunk/tools/editor_trends/manage.py trunk/tools/editor_trends/map_wiki_editors.py trunk/tools/editor_trends/optimize_editors.py trunk/tools/editor_trends/utils/utils.py Modified: trunk/tools/editor_trends/construct_datasets.py =================================================================== --- trunk/tools/editor_trends/construct_datasets.py 2010-11-06 19:22:16 UTC (rev 76210) +++ trunk/tools/editor_trends/construct_datasets.py 2010-11-06 19:37:32 UTC (rev 76211) @@ -134,33 +134,41 @@ input_queue = pc.load_queue(ids) q = Queue() generate_editor_dataset(input_queue, q, False, kwargs) - #generate_editor_dataset_launcher() - #retrieve_list_contributors() - #retrieve_edits_by_contributor() -def generate_editor_dataset_launcher(): + +def generate_editor_dataset_launcher(dbname): kwargs = {'nr_input_processors': 1, 'nr_output_processors': 1, 'debug': False, - 'dbname': 'enwiki', + 'dbname': dbname, } - ids = retrieve_editor_ids_mongo('enwiki', 'editors') - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, **kwargs) + ids = retrieve_editor_ids_mongo(dbname, 'editors') + chunks = {} + parts = int(round(float(len(ids)) / 1, 0)) + a = 0 + for x in xrange(settings.NUMBER_OF_PROCESSES): + b = a + parts + chunks[x] = ids[a:b] + a = (x + 1) * parts + if a >= len(ids): + break + + pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs) -def generate_editor_dataset_debug(): - ids = retrieve_editor_ids_mongo('enwiki', 'editors') +def generate_editor_dataset_debug(dbname): + ids = retrieve_editor_ids_mongo(dbname, 'editors') input_queue = pc.load_queue(ids) #write_dataset(input_queue, [], 'enwiki') kwargs = {'nr_input_processors': 1, 'nr_output_processors': 1, 'debug': True, - 'dbname': 'enwiki', + 'dbname': dbname, } generate_editor_dataset(input_queue, False, False, kwargs) if __name__ == '__main__': - #generate_editor_dataset_debug() - generate_editor_dataset_launcher() + #generate_editor_dataset_debug('test') + generate_editor_dataset_launcher('test') #debug_retrieve_edits_by_contributor_launcher() Modified: trunk/tools/editor_trends/manage.py =================================================================== --- trunk/tools/editor_trends/manage.py 2010-11-06 19:22:16 UTC (rev 76210) +++ trunk/tools/editor_trends/manage.py 2010-11-06 19:37:32 UTC (rev 76211) @@ -32,6 +32,8 @@ from utils import dump_downloader import split_xml_file import map_wiki_editors +import optimize_editors +import construct_datasets import config @@ -96,7 +98,7 @@ project = project.title() language_map = utils.invert_dict(languages.MAPPING) print 'Project: %s' % (project) - print 'Language: %s' % language_map[language_code] + print 'Language: %s' % language_map[language_code].decode('utf-8') print 'Input directory: %s' % location print 'Output directory: TODO' @@ -130,23 +132,37 @@ def extract_xml_file(args, location, file): path = config.detect_installed_program('7zip') - source = os.path.join(location, file) - p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)]) + p = None + + if settings.OS == 'Windows': + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() + elif settings.OS == 'Linux': + raise NotImplementedError + elif settings.OS == 'OSX': + raise NotImplementedError + else: + raise exceptions.PlatformNotSupportedError return p def mongodb_script_launcher(args, location, filename, project, language_code): print 'mongodb_script_launcher' map_wiki_editors.run_parse_editors(project, language_code, location) - #print args +def dataset_launcher(args, project): + print 'dataset launcher' + optimize_editors.run_optimize_editors(project) + construct_datasets.generate_editor_dataset_launcher(project) + + def all_launcher(args, location, filename, project, language_code): print 'all_launcher' dump_downloader_launcher(args, location, filename, project, language_code) split_xml_file_launcher(args, location, filename, project, language_code) mongodb_script_launcher(args, location, filename, project, language_code) + dataset_launcher(args, location, filename, project, language_code) def supported_languages(): @@ -165,23 +181,30 @@ languages.append(choice) languages.sort() for language in languages: - if first == None: + try: + if first != None and language.startswith(first): + print '%s' % language.decode('utf-8') + elif first == None: + print '%s' % language.decode('utf-8') + except UnicodeEncodeError: print '%s' % language - elif first != None and language.startswith(first): - print '%s' % language +def about(): + print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.' + print 'Written by Diederik van Liere (dvanli...@gmail.com).' + print 'This software comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to distribute it under certain conditions.' + print 'See the README.1ST file for more information.' + print '' + def main(): default_language = determine_default_language() file_choices = ('stub-meta-history.xml.gz', - 'stub-meta-current.xml.gz', - 'pages-meta-history.xml.7z', - 'pages-meta-current.xml.bz2') + 'stub-meta-current.xml.gz', + 'pages-meta-history.xml.7z', + 'pages-meta-current.xml.bz2') parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) - #group = parser.add_mutually_exclusive_group() - #group.add_argument('show_languages', action='store') - #group.add_argument('language', action='store') subparsers = parser.add_subparsers(help='sub-command help') parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') @@ -202,6 +225,9 @@ parser_create = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') parser_create.set_defaults(func=mongodb_script_launcher) + parser_dataset = subparsers.add_parser('dataset', help='Create a dataset from the MongoDB and write it to a csv file.') + parser_dataset.set_defaults(func=dataset_launcher) + parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') parser_all.set_defaults(func=all_launcher) @@ -230,6 +256,7 @@ args = parser.parse_args() config.load_configuration(args) locations = determine_file_locations(args) + about() show_settings(args, **locations) args.func(args, **locations) Modified: trunk/tools/editor_trends/map_wiki_editors.py =================================================================== --- trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 19:22:16 UTC (rev 76210) +++ trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 19:37:32 UTC (rev 76211) @@ -135,7 +135,7 @@ if settings.DEBUG: messages = {} vars = {} - + while True: try: if debug: @@ -261,7 +261,9 @@ for editor in cache[c]: editor_cache.add(editor, cache[c][editor]) cache[c] = {} - editor_cache.add('NEXT', '') + editor_cache.add('NEXT', '') + cache = {} + def load_bot_ids(): Modified: trunk/tools/editor_trends/optimize_editors.py =================================================================== --- trunk/tools/editor_trends/optimize_editors.py 2010-11-06 19:22:16 UTC (rev 76210) +++ trunk/tools/editor_trends/optimize_editors.py 2010-11-06 19:37:32 UTC (rev 76211) @@ -82,7 +82,7 @@ edits = editor['edits'] edits = sorted(edits, key=itemgetter('date')) edit_count = len(edits) - new_wikipedian = edits[9]['date'].year + new_wikipedian = edits[9]['date'] first_edit = edits[0]['date'] final_edit = edits[-1]['date'] edits_by_year = determine_edits_by_year(edits) @@ -91,7 +91,7 @@ output.insert({'editor': id, 'edits': edits, 'edits_by_year': edits_by_year, - 'year_joined': new_wikipedian, + 'new_wikipedian': new_wikipedian, 'edit_count': edit_count, 'final_edit': final_edit, 'first_edit': first_edit, Modified: trunk/tools/editor_trends/utils/utils.py =================================================================== --- trunk/tools/editor_trends/utils/utils.py 2010-11-06 19:22:16 UTC (rev 76210) +++ trunk/tools/editor_trends/utils/utils.py 2010-11-06 19:37:32 UTC (rev 76211) @@ -276,15 +276,22 @@ return d -def retrieve_file_list(location, extension): +def retrieve_file_list(location, extension, mask=''): + ''' + Retrieve a list of files from a specified location. + @location: either an absolute or relative path + @extension: only include files with extension (optional) + @mask: only include files that start with mask (optional) + + @return: a list of files matching the criteria + ''' all_files = os.listdir(location) if not extension.startswith('.'): extension = '.' + extension files = [] for file in all_files: - if file.endswith(extension): + if file.startswith(mask) and file.endswith(extension): files.append(file) - return files _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs