http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76211

Revision: 76211
Author:   diederik
Date:     2010-11-06 19:37:32 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Fixes include:
* utf8 support on console if proper fonts are installed
* separation of concerns

Modified Paths:
--------------
    trunk/tools/editor_trends/construct_datasets.py
    trunk/tools/editor_trends/manage.py
    trunk/tools/editor_trends/map_wiki_editors.py
    trunk/tools/editor_trends/optimize_editors.py
    trunk/tools/editor_trends/utils/utils.py

Modified: trunk/tools/editor_trends/construct_datasets.py
===================================================================
--- trunk/tools/editor_trends/construct_datasets.py     2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/construct_datasets.py     2010-11-06 19:37:32 UTC (rev 76211)
@@ -134,33 +134,41 @@
     input_queue = pc.load_queue(ids)
     q = Queue()
     generate_editor_dataset(input_queue, q, False, kwargs)
-    #generate_editor_dataset_launcher()
-    #retrieve_list_contributors()
-    #retrieve_edits_by_contributor()
 
-def generate_editor_dataset_launcher():
+
+def generate_editor_dataset_launcher(dbname):
     kwargs = {'nr_input_processors': 1,
               'nr_output_processors': 1,
               'debug': False,
-              'dbname': 'enwiki',
+              'dbname': dbname,
               }
-    ids = retrieve_editor_ids_mongo('enwiki', 'editors')
-    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, **kwargs)
+    ids = retrieve_editor_ids_mongo(dbname, 'editors')
+    chunks = {}
+    parts = int(round(float(len(ids)) / 1, 0))
+    a = 0
+    for x in xrange(settings.NUMBER_OF_PROCESSES):
+        b = a + parts
+        chunks[x] = ids[a:b]
+        a = (x + 1) * parts
+        if a >= len(ids):
+            break
+        
+    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs)
 
 
-def generate_editor_dataset_debug():
-    ids = retrieve_editor_ids_mongo('enwiki', 'editors')
+def generate_editor_dataset_debug(dbname):
+    ids = retrieve_editor_ids_mongo(dbname, 'editors')
     input_queue = pc.load_queue(ids)
     #write_dataset(input_queue, [], 'enwiki')
     kwargs = {'nr_input_processors': 1,
               'nr_output_processors': 1,
               'debug': True,
-              'dbname': 'enwiki',
+              'dbname': dbname,
               }
     generate_editor_dataset(input_queue, False, False, kwargs)
 
 
 if __name__ == '__main__':
-    #generate_editor_dataset_debug()
-    generate_editor_dataset_launcher()
+    #generate_editor_dataset_debug('test')
+    generate_editor_dataset_launcher('test')
     #debug_retrieve_edits_by_contributor_launcher()

Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/manage.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -32,6 +32,8 @@
 from utils import dump_downloader
 import split_xml_file
 import map_wiki_editors
+import optimize_editors
+import construct_datasets
 import config
 
 
@@ -96,7 +98,7 @@
     project = project.title()
     language_map = utils.invert_dict(languages.MAPPING)
     print 'Project: %s' % (project)
-    print 'Language: %s' % language_map[language_code]
+    print 'Language: %s' % language_map[language_code].decode('utf-8')
     print 'Input directory: %s' % location
     print 'Output directory: TODO'
   
@@ -130,23 +132,37 @@
 
 def extract_xml_file(args, location, file):
     path = config.detect_installed_program('7zip')
-
     source = os.path.join(location, file)
-    p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])
+    p = None
+    
+    if settings.OS == 'Windows':
+        p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait()
+    elif settings.OS == 'Linux':
+        raise NotImplementedError
+    elif settings.OS == 'OSX':
+        raise NotImplementedError
+    else:
+        raise exceptions.PlatformNotSupportedError
     return p
 
 
 def mongodb_script_launcher(args, location, filename, project, language_code):
     print 'mongodb_script_launcher'
     map_wiki_editors.run_parse_editors(project, language_code, location)
-    #print args
 
 
+def dataset_launcher(args, project):
+    print 'dataset launcher'
+    optimize_editors.run_optimize_editors(project)
+    construct_datasets.generate_editor_dataset_launcher(project)
+
+
 def all_launcher(args, location, filename, project, language_code):
     print 'all_launcher'
     dump_downloader_launcher(args, location, filename, project, language_code)
     split_xml_file_launcher(args, location, filename, project, language_code)
     mongodb_script_launcher(args, location, filename, project, language_code)
+    dataset_launcher(args, location, filename, project, language_code)
 
 
 def supported_languages():
@@ -165,23 +181,30 @@
         languages.append(choice)
     languages.sort()
     for language in languages:
-        if first == None:
+        try:
+            if first != None and language.startswith(first):
+                print '%s' % language.decode('utf-8')
+            elif first == None:
+                print '%s' % language.decode('utf-8')
+        except UnicodeEncodeError:
             print '%s' % language
-        elif first != None and language.startswith(first):
-            print '%s' % language
 
+def about():
+    print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
+    print 'Written by Diederik van Liere (dvanli...@gmail.com).'
+    print 'This software comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to distribute it under certain conditions.'
+    print 'See the README.1ST file for more information.'
+    print '' 
+    
 
 def main():
     default_language = determine_default_language()
     file_choices = ('stub-meta-history.xml.gz',
-                  'stub-meta-current.xml.gz',
-                  'pages-meta-history.xml.7z',
-                  'pages-meta-current.xml.bz2')
+                    'stub-meta-current.xml.gz',
+                    'pages-meta-history.xml.7z',
+                    'pages-meta-current.xml.bz2')
 
     parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
-    #group = parser.add_mutually_exclusive_group()
-    #group.add_argument('show_languages', action='store')
-    #group.add_argument('language', action='store')
     subparsers = parser.add_subparsers(help='sub-command help')
 
     parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.')
@@ -202,6 +225,9 @@
     parser_create = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
     parser_create.set_defaults(func=mongodb_script_launcher)
 
+    parser_dataset = subparsers.add_parser('dataset', help='Create a dataset from the MongoDB and write it to a csv file.')
+    parser_dataset.set_defaults(func=dataset_launcher)
+    
     parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
     parser_all.set_defaults(func=all_launcher)
 
@@ -230,6 +256,7 @@
     args = parser.parse_args()
     config.load_configuration(args)
     locations = determine_file_locations(args)
+    about()
     show_settings(args, **locations)
     args.func(args, **locations)
 

Modified: trunk/tools/editor_trends/map_wiki_editors.py
===================================================================
--- trunk/tools/editor_trends/map_wiki_editors.py       2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/map_wiki_editors.py       2010-11-06 19:37:32 UTC (rev 76211)
@@ -135,7 +135,7 @@
     if settings.DEBUG:
         messages = {}
         vars = {}
-    
+
     while True:
         try:
             if debug:
@@ -261,7 +261,9 @@
         for editor in cache[c]:
             editor_cache.add(editor, cache[c][editor])
         cache[c] = {}
-    editor_cache.add('NEXT', '')
+        editor_cache.add('NEXT', '')
+    cache = {}
+    
 
 
 def load_bot_ids():

Modified: trunk/tools/editor_trends/optimize_editors.py
===================================================================
--- trunk/tools/editor_trends/optimize_editors.py       2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/optimize_editors.py       2010-11-06 19:37:32 UTC (rev 76211)
@@ -82,7 +82,7 @@
             edits = editor['edits']
             edits = sorted(edits, key=itemgetter('date'))
             edit_count = len(edits)
-            new_wikipedian = edits[9]['date'].year
+            new_wikipedian = edits[9]['date']
             first_edit = edits[0]['date']
             final_edit = edits[-1]['date']
             edits_by_year = determine_edits_by_year(edits)
@@ -91,7 +91,7 @@
 
             output.insert({'editor': id, 'edits': edits,
                            'edits_by_year': edits_by_year,
-                           'year_joined': new_wikipedian,
+                           'new_wikipedian': new_wikipedian,
                            'edit_count': edit_count,
                            'final_edit': final_edit,
                            'first_edit': first_edit,

Modified: trunk/tools/editor_trends/utils/utils.py
===================================================================
--- trunk/tools/editor_trends/utils/utils.py    2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/utils/utils.py    2010-11-06 19:37:32 UTC (rev 76211)
@@ -276,15 +276,22 @@
     return d
 
 
-def retrieve_file_list(location, extension):
+def retrieve_file_list(location, extension, mask=''):
+    '''
+    Retrieve a list of files from a specified location.
+    @location: either an absolute or relative path
+    @extension: only include files with extension (optional)
+    @mask: only include files that start with mask (optional)
+    
+    @return: a list of files matching the criteria
+    '''
     all_files = os.listdir(location)
     if not extension.startswith('.'):
         extension = '.' + extension
     files = []
     for file in all_files:
-        if file.endswith(extension):
+        if file.startswith(mask) and file.endswith(extension):
             files.append(file)
-
     return files
 
 


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to