http://www.mediawiki.org/wiki/Special:Code/MediaWiki/93800

Revision: 93800
Author:   rfaulk
Date:     2011-08-03 06:42:41 +0000 (Wed, 03 Aug 2011)
Log Message:
-----------
Category Loader now collects data for in-degree, out-degree, subcategeories, 
and the nodes which only have edges in or out exclusively 

Modified Paths:
--------------
    trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py

Modified: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
===================================================================
--- trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py     2011-08-03 
05:56:29 UTC (rev 93799)
+++ trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py     2011-08-03 
06:42:41 UTC (rev 93800)
@@ -12,7 +12,7 @@
 
 
 """ Import python base modules """
-import sys, getopt, re, datetime, logging, MySQLdb, settings
+import sys, getopt, re, datetime, logging, MySQLdb, settings, operator
 import networkx as nx
 
 """ Import Analytics modules """
@@ -91,13 +91,13 @@
         self._query_names_['get_subcategories'] = "select cl_to from 
categorylinks_cp where cl_from = %s"
         self._query_names_['delete_from_recs'] = "delete from 
rfaulk.categorylinks_cp where cl_from = %s"
         self._query_names_['is_empty'] = "select * from 
rfaulk.categorylinks_cp limit 1"
-        self._query_names_['get_category_links'] = "select cl_from, cl_to from 
categorylinks_cp limit 100"
+        self._query_names_['get_category_links'] = "select cl_from, cl_to from 
categorylinks_cp limit 10000"
         
         WSORSlaveDataLoader.__init__(self)    
         logging.info('Creating CategoryLoader')
     
     """
-        
+        Retrieves all rows out of the category links table
     """
     def get_category_links(self):
         
@@ -236,8 +236,8 @@
     """ 
     def extract_hierarchy(self):
                         
-        #self.drop_category_links_cp_table()
-        #self.create_category_links_cp_table()
+        self.drop_category_links_cp_table()
+        self.create_category_links_cp_table()
                     
         """ Create graph """
         logging.info('Initializing directed graph...')
@@ -256,24 +256,83 @@
         links = self.get_category_links()
         count = 0
         
+        out_degrees = dict()
+        in_degrees = dict()
+        subcategories = dict()
+        
+        """ Process subcategory links """
         for row in links:
             
             cl_from = int(row[0])
             cl_to = str(row[1])
             cl_from = self.get_page_title(cl_from)
+        
+            try:
+                subcategories[cl_from].append(cl_to)
             
+            except KeyError:    
+                subcategories[cl_from] = list()
+                subcategories[cl_from].append(cl_to)
+                
+            try:
+                out_degrees[cl_from] = out_degrees[cl_from] + 1
+            except KeyError:
+                out_degrees[cl_from] = 1
+            
+            try:
+                in_degrees[cl_to] = in_degrees[cl_to] + 1
+            except KeyError:
+                in_degrees[cl_to] = 1
+                     
             directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
             
-            if self.__DEBUG__:
+            if self.__DEBUG__ and count % 1000 == 0:
                 
                 logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
-                count = count + 1
+            
+            count = count + 1
         
+        logging.info('Sorting in degree list.')
+        sorted_in_degrees = sorted(in_degrees.iteritems(), 
key=operator.itemgetter(1), reverse=True)
+        logging.info('Sorting out degree list.')
+        sorted_out_degrees = sorted(out_degrees.iteritems(), 
key=operator.itemgetter(1), reverse=True)
+        
+        in_only, out_only = 
self.get_uni_directionally_linked_categories(sorted_in_degrees, 
sorted_out_degrees)
+        
         logging.info('Category links finished processing.')
         
-        return directed_graph
+        return directed_graph, in_degrees, out_degrees, sorted_in_degrees, 
sorted_out_degrees, subcategories, in_only, out_only
     
     
+    """
+        Returns 
+    """
+    def get_uni_directionally_linked_categories(self, in_degrees, out_degrees):
+        
+        logging.info('Generating lists of categories have either only in 
degrees or out degrees.')
+        
+        in_keys = list()
+        for i in in_degrees:
+            in_keys.append(i[0])
+        
+        out_keys = list()
+        for i in out_degrees:
+            out_keys.append(i[0])
+        
+        in_only = list()
+        out_only = list()
+        
+        for i in in_degrees:
+            if not(i[0] in out_keys):
+                in_only.append(i) 
+            
+        for i in out_degrees:
+            if not(i[0] in in_keys):
+                out_only.append(i)
+        
+        return in_only, out_only
+    
+    
     """ drop rfaulk.categorylinks_cp """
     def drop_category_links_cp_table(self):
         


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to