http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76201
Revision: 76201 Author: diederik Date: 2010-11-06 17:42:36 +0000 (Sat, 06 Nov 2010) Log Message: ----------- Various bugfixes Modified Paths: -------------- trunk/tools/editor_trends/config.py trunk/tools/editor_trends/manage.py trunk/tools/editor_trends/map_wiki_editors.py trunk/tools/editor_trends/optimize_editors.py trunk/tools/editor_trends/split_xml_file.py trunk/tools/editor_trends/utils/process_constructor.py trunk/tools/editor_trends/utils/utils.py Modified: trunk/tools/editor_trends/config.py =================================================================== --- trunk/tools/editor_trends/config.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/config.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -20,13 +20,14 @@ import os import ConfigParser -from _winreg import * + import settings from utils import utils def detect_windows_program(program): + from _winreg import * entry = settings.WINDOWS_REGISTER[program] try: key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) Modified: trunk/tools/editor_trends/manage.py =================================================================== --- trunk/tools/editor_trends/manage.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/manage.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -22,8 +22,8 @@ import subprocess from argparse import ArgumentParser from argparse import RawTextHelpFormatter +import locale - import progressbar import settings @@ -43,6 +43,11 @@ config.load_configuration(args) +def determine_default_language(): + language_code = locale.getdefaultlocale()[0] + return language_code.split('_')[0] + + def retrieve_projectname(args): language_code = retrieve_language(args) if language_code == None: @@ -53,13 +58,16 @@ if project == None: print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') sys.exit(-1) + if project == 'commonswiki': + return project + else: + return '%s%s' % (language_code, project) - return '%s%s' % (language_code, project) def retrieve_language(args): language = get_value(args, 'language') language = language.title() - return languages.MAPPING.get(language, None) + return languages.MAPPING.get(language, 'en') def retrieve_project(args): @@ -75,13 +83,24 @@ def determine_file_locations(args): locations = {} + location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION locations['language_code'] = retrieve_language(args) - locations['location'] = os.path.join(get_value(args, 'location'), retrieve_language(args)) + locations['location'] = os.path.join(location, retrieve_language(args)) locations['project'] = retrieve_projectname(args) locations['filename'] = generate_wikidump_filename(args) return locations +def show_settings(args, location, filename, project, language_code): + project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki') + project = project.title() + language_map = utils.invert_dict(languages.MAPPING) + print 'Project: %s' % (project) + print 'Language: %s' % language_map[language_code] + print 'Input directory: %s' % location + print 'Output directory: TODO' + + def dump_downloader_launcher(args, location, filename, project, language_code): print 'dump downloader' pbar = get_value(args, 'progress') @@ -113,8 +132,8 @@ path = config.detect_installed_program('7zip') source = os.path.join(location, file) - retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)]) - return retcode + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)]) + return p def mongodb_script_launcher(args, location, filename, project, language_code): @@ -153,6 +172,7 @@ def main(): + default_language = determine_default_language() file_choices = ('stub-meta-history.xml.gz', 'stub-meta-current.xml.gz', 'pages-meta-history.xml.7z', @@ -188,7 +208,7 @@ parser.add_argument('-l', '--language', action='store', help='Example of valid languages.', choices=supported_languages(), - default='Russian') + default=default_language) parser.add_argument('-p', '--project', action='store', help='Specify the Wikimedia project that you would like to download', @@ -210,6 +230,7 @@ args = parser.parse_args() config.load_configuration(args) locations = determine_file_locations(args) + show_settings(args, **locations) args.func(args, **locations) Modified: trunk/tools/editor_trends/map_wiki_editors.py =================================================================== --- trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -244,6 +244,26 @@ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) +def load_cache_objects(): + cache = {} + files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin') + for x, file in enumerate(files): + cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file) + return cache + + +def search_cache_for_missed_editors(dbname): + mongo = db.init_mongo_db(dbname) + collection = mongo['editors'] + editor_cache = cache.EditorCache(collection) + cache = load_cache_objects() + for c in cache: + for editor in cache[c]: + editor_cache.add(editor, cache[c][editor]) + cache[c] = {} + editor_cache.add('NEXT', '') + + def load_bot_ids(): ''' Loader function to retrieve list of id's of known Wikipedia bots. @@ -267,7 +287,6 @@ 'language': language, } chunks = {} - #file_location = os.path.join(settings.XML_FILE_LOCATION, language) files = utils.retrieve_file_list(location, 'xml') parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) a = 0 @@ -277,12 +296,14 @@ a = (x + 1) * parts pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs) + search_cache_for_missed_editors(dbname) def debug_parse_editors(dbname): q = JoinableQueue() parse_editors('en\\522.xml', q, None, None, True) store_editors(q, [], dbname) + search_cache_for_missed_editors(dbname) if __name__ == "__main__": Modified: trunk/tools/editor_trends/optimize_editors.py =================================================================== --- trunk/tools/editor_trends/optimize_editors.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/optimize_editors.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -17,11 +17,15 @@ __date__ = '2010-11-02' __version__ = '0.1' +from multiprocessing import Queue +from Queue import Empty +from operator import itemgetter +import datetime - import settings from database import db from utils import process_constructor as pc +import construct_datasets def create_datacontainer(init_value=0): @@ -37,7 +41,7 @@ data[str(x)] = init_value return data - + def determine_edits_by_year(dates): ''' This function counts the number of edits by year made by a particular editor. @@ -87,7 +91,7 @@ output.insert({'editor': id, 'edits': edits, 'edits_by_year': edits_by_year, - 'year_joined': year, + 'year_joined': new_wikipedian, 'edit_count': edit_count, 'final_edit': final_edit, 'first_edit': first_edit, @@ -101,20 +105,31 @@ kwargs = {'definition': 'traditional', 'pbar': True, 'dbname': 'enwiki', - 'nr_input_processors': 2, + 'nr_input_processors': 1, 'nr_output_processors': 0, } - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs) + chunks = {} + parts = int(round(float(len(ids)) / 1, 0)) + a = 0 + for x in xrange(settings.NUMBER_OF_PROCESSES): + b = a + parts + chunks[x] = ids[a:b] + a = (x + 1) * parts + if a >= len(ids): + break + pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) + def debug_optimize_editors(dbname): ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') q = pc.load_queue(ids) kwargs = {'definition': 'traditional', - 'dbname': 'enwiki' + 'dbname': dbname } optimize_editors(q, False, True, kwargs) if __name__ == '__main__': - run_optimize_editors('enwiki') \ No newline at end of file + debug_optimize_editors('test') + #run_optimize_editors('test') Modified: trunk/tools/editor_trends/split_xml_file.py =================================================================== --- trunk/tools/editor_trends/split_xml_file.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/split_xml_file.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -172,7 +172,7 @@ #elem = parse_comments(elem, remove_ascii_control_characters) #print cElementTree.tostring(elem) except SyntaxError: - fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) + fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) fh.write(cElementTree.tostring(elem)) fh.close() Modified: trunk/tools/editor_trends/utils/process_constructor.py =================================================================== --- trunk/tools/editor_trends/utils/process_constructor.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/utils/process_constructor.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -57,14 +57,16 @@ nr_output_processors = kwargs.pop('nr_output_processors') input_queues = {} result_queues = {} - assert len(obj) == nr_input_processors - if result_queue: - assert len(obj)== nr_output_processors + #assert len(obj) == nr_input_processors + #if result_queue: + # assert len(obj)== nr_output_processors for i, o in enumerate(obj): input_queues[i] = load_input_queue(obj[o], poison_pill=True) if result_queue: result_queues[i] = JoinableQueue() + else: + result_queues[i] = False if settings.PROGRESS_BAR: size = sum([input_queues[q].qsize() for q in input_queues]) Modified: trunk/tools/editor_trends/utils/utils.py =================================================================== --- trunk/tools/editor_trends/utils/utils.py 2010-11-06 17:35:15 UTC (rev 76200) +++ trunk/tools/editor_trends/utils/utils.py 2010-11-06 17:42:36 UTC (rev 76201) @@ -32,6 +32,7 @@ import ctypes import settings +import exceptions try: @@ -160,6 +161,7 @@ else: return 'wb' + def write_list_to_csv(data, fh, recursive=False): if recursive: recursive = False @@ -170,6 +172,7 @@ fh.write('%s\t' % d) if recursive: return True + def write_dict_to_csv(data, fh): keys = data.keys() @@ -225,7 +228,7 @@ if is_exe(exe_file): return exe_file - return None + raise exceptions.FileNotFoundException(program) def store_object(object, location, filename): @@ -254,6 +257,15 @@ return string +def invert_dict(dictionary): + ''' + @dictionary is a simple dictionary containing simple values, ie. no lists, + or other dictionaries + output: dictionary where key and value are swapped. + ''' + return dict([[v,k] for k,v in dictionary.items()]) + + def create_dict_from_csv_file(filename, encoding): d = {} for line in read_data_from_csv(filename, encoding): _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs