http://www.mediawiki.org/wiki/Special:Code/MediaWiki/91025

Revision: 91025
Author:   rfaulk
Date:     2011-06-28 22:09:09 +0000 (Tue, 28 Jun 2011)
Log Message:
-----------
contents moved to repository 
http://svn.wikimedia.org/viewvc/wikimedia/trunk/fundraiser-analysis/

Modified Paths:
--------------
    trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject

Removed Paths:
-------------
    trunk/fundraiser-statistics/fundraiser-scripts/__init__.py
    trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py
    trunk/fundraiser-statistics/fundraiser-scripts/html/
    trunk/fundraiser-statistics/fundraiser-scripts/logs/
    trunk/fundraiser-statistics/fundraiser-scripts/mine-log/
    trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py
    trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py
    trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py
    trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py
    trunk/fundraiser-statistics/fundraiser-scripts/settings.py
    trunk/fundraiser-statistics/fundraiser-scripts/squid_logs/
    trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py
    trunk/fundraiser-statistics/fundraiser-scripts/wrappers/

Modified: trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject        2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject        2011-06-28 22:09:09 UTC (rev 91025)
@@ -5,6 +5,8 @@
 <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
 <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
 <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
-<path>/Fundraiser_Tools/src</path>
+<path>/Fundraiser_Tools</path>
+<path>/Fundraiser_Tools/classes</path>
+<path>/Fundraiser_Tools/wrappers</path>
 </pydev_pathproperty>
 </pydev_project>

Deleted: trunk/fundraiser-statistics/fundraiser-scripts/__init__.py
===================================================================
Deleted: trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py   2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py   2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,62 +0,0 @@
-
-"""
-
-data_aggregation_script.py
-
-wikimediafoundation.org
-Ryan Faulkner
-November 8th, 2010
-
-"""
-
-import sys
-import MySQLdb
-import model_data as md
-
-
-"""
-Parse the input args
-
-"""
-try:
-       start_time = sys.argv[1]
-       end_time = sys.argv[2]
-       banner = sys.argv[3]
-       # sql_file_name = sys.argv[3]
-       
-except IndexError:
-       sys.exit('Please enter a UTC start and end time: \n' \
-       + ' e.g. $data_aggregation_script.py 20101010000000 20101010010000 2010_testing50\n')
-       
-       
-
-"""
-INITIALIZE DB ACCESS
-
-"""
-
-""" Establish connection """
-# db = MySQLdb.connect(host='localhost', user='root', passwd='baggin5', db='faulkner')
-db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner')
-
-""" Create cursor """
-cur = db.cursor()
-
-
-"""
-Generate a unique test ID
-
-"""
-
-sql_file_name_2step = './sql/full_analytics_2step.sql'
-sql_file_name_1step = './sql/full_analytics_1step.sql'
-# sql_file_name = './sql/test.sql'
-# sql_file_name = './sql/landpage_test.sql'
-# start_time = '20101104234000' 
-# end_time = '20101105010000'
-
-md.compile_test_instance_data(db, cur, sql_file_name_2step, sql_file_name_1step, start_time, end_time)
-
-# Close connection
-cur.close()
-db.close()

Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py     2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py     2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,266 +0,0 @@
-
-"""
-
-mine_contacts.py
-
-wikimediafoundation.org
-Ryan Faulkner
-February 11th, 2010
-
-"""
-
-# =============================================
-# Pulls metrics from database to perform statistical analysis
-# =============================================
-
-import sys
-import getopt
-import re
-import datetime
-
-import MySQLdb
-import HTML
-
-import test_reporting as tr
-import miner_help as mh
-import query_store as qs
-import fundraiser_reporting as fr
-
-
-
-class contact_handler(tr.data_handler, fr.TimestampProcesser):
-
-       __query_obj = qs.query_store()
-       __file_name = './csv/Rebeccas_Contacts_Donation_Alerts_mod.csv'
-       __sql_path = './sql/'
-       __html_path = './html/'
-       __query_handle_contact = 'report_ecomm_by_contact'
-       __query_handle_amount = 'report_ecomm_by_amount'
-               
-       
-       
-       def __init__(self):             
-               super(tr.data_handler, self).__init__()
-               
-       """
-       
-       method :: parse_contacts:
-       
-       Constants:  
-       =========
-       
-       file_name = ./csv/Rebeccas_Contacts_Donation_Alerts.csv
-       
-       """
-       
-       def parse_contacts(self):
-               
-               # Initialization - open the file
-               # FileName = sys.argv[1];
-               if (re.search('\.gz',self.__file_name)):
-                       file_obj = gzip.open(self.__file_name, 'r')
-               else:
-                       file_obj = open(self.__file_name, 'r')
-                       
-               
-               # Sample from csv file
-               # Internal Contact ID,  Sort Name,      First Name,     Last Name,      Contact Type
-               # 120842,       "Abramowicz, David",    David,  Abramowicz,     Individual
-               index_first_name = 3
-               index_last_name = 4
-               contact_list = list()
-               
-               
-               # PROCESS LOG FILE
-               # ================
-               line = file_obj.readline()
-               while (line != ''):
-                       
-                       lineArgs = line.split(',')              
-                       # print lineArgs[index_first_name] + ' ' + lineArgs[index_last_name]
-                       contact_list.append([lineArgs[index_first_name], lineArgs[index_last_name]])
-                       line = file_obj.readline()
-       
-               return contact_list
-
-
-       """
-       
-       Civi Reporting - Create a table from a list of contacts
-       
-       This method looks at 
-       
-       Constants:  
-       =========
-       
-       file_name = ./sql/Rebeccas_Contacts_Donation_Alerts.csv
-       
-       
-       """
-       def build_html_table_by_contact(self):
-               
-               
-               # get the contacts from the list
-               list = self.parse_contacts()
-               
-               table_data = []
-               sql_stmnt = mh.read_sql(self.__sql_path + self.__query_handle_contact + '.sql');
-               
-               # open the html files for writing
-               f = open(self.__html_path + self.__query_handle_contact + '.html', 'w')
-               
-               # construct contact where string
-               # iterate through the list
-               where_str = 'where '
-               first_item = 0
-               for i in list[1:]:
-                       
-                       # only process People now - not organizations or Households
-                       if not(re.search('Organization', i[0]) or re.search('Organization', i[1]) or re.search('Household', i[0]) or re.search('Household', i[1])):
-                               if not(first_item):
-                                       first_item = 1
-                               else:
-                                       where_str = where_str + ' or '
-                               
-                               where_str = where_str + '(first_name = \'' + i[0].strip() + '\' and last_name = \'' + i[1].strip() + '\')'
-               
-               # Formats the statement according to query type
-               select_stmnt = self.__query_obj.format_query(self.__query_handle_contact, sql_stmnt, [where_str])
-               #print select_stmnt
-               
-               # initialize the db and execute the query
-               self.init_db()
-               
-               try:
-                       
-                       # execute statement gathering amounts
-                       err_msg = select_stmnt
-                       self._cur.execute(select_stmnt)
-                       results = self._cur.fetchall()
-               
-                       for row in results:
-                               cpRow = self.listify(row)
-                               table_data.append(cpRow)
-                                               
-               except:
-                       self._db.rollback()
-                       sys.exit("Database Interface Exception:\n" + err_msg)
-               
-               self.close_db()
-               
-               # Construct the html table
-               header = self.__query_obj.get_query_header(self.__query_handle_contact)
-               t = HTML.table(table_data, header_row=header)
-               htmlcode = str(t)
-               
-               f.write(htmlcode)
-               f.close()
-               
-               return htmlcode
-
-
-
-       """
-       
-       Civi Reporting - Create a table from a list of contacts
-       Creates two html tables
-       
-       """
-       def build_html_table_by_amount(self):
-
-               
-               # Initialize times  
-               now = datetime.datetime.now()
-               hours_back = 24 * 7
-               
-               # start_time, end_time = self.gen_date_strings_hr(now, hours_back)
-               start_time, end_time = self.gen_date_strings(now, hours_back, 2, 3)
-               
-
-               # Prepare SQL statements and tables
-               table_data = []
-               sql_stmnt = mh.read_sql(self.__sql_path + self.__query_handle_amount + '.sql');
-               
-               # open the html files for writing
-               f = open(self.__html_path + self.__query_handle_amount + '.html', 'w')
-               
-               # Formats the statement according to query type
-               select_stmnt = self.__query_obj.format_query(self.__query_handle_amount, sql_stmnt, [start_time, end_time])
-               
-               # initialize the db and execute the query
-               self.init_db()
-               
-               try:
-                       
-                       # execute statement gathering amounts
-                       err_msg = select_stmnt
-                       self._cur.execute(select_stmnt)
-                       results = self._cur.fetchall()
-                       
-                       for row in results:
-                               cpRow = self.listify(row)
-                               table_data.append(cpRow)
-                       
-                       
-               except:
-                       self._db.rollback()
-                       sys.exit("Database Interface Exception:\n" + err_msg)
-               
-               self.close_db()
-               
-               # Construct the html table
-               header = self.__query_obj.get_query_header(self.__query_handle_amount)
-               t = HTML.table(table_data, header_row=header)
-               htmlcode = str(t)
-               
-               f.write(htmlcode)
-               f.close()
-               
-               return htmlcode
-       
-       
-       
-"""
-
-method :: main
-
-to parse contact names invoke:
-
-rfaulkner@wmf128:~/trunk/projects/fundraiser-statistics/fundraiser-scripts$ python mine_contacts.py /home/rfaulkner/trunk/docs/Rebeccas_Contacts_Donation_Alerts.csv
-
-"""
-
-class Usage(Exception):
-    def __init__(self, msg):
-        self.msg = msg
-
-def main(argv=None):
-       if argv is None:
-               argv = sys.argv
-       try:
-               try:
-                       opts, args = getopt.getopt(argv[1:], "h", ["help"])
-               except getopt.error, msg:
-                       raise Usage(msg)
-                               
-       # more code, unchanged
-       except Usage, err:
-               print >>sys.stderr, err.msg
-               print >>sys.stderr, "for help use --help"
-               return 2
-
-       contact_handler_obj = contact_handler()
-       
-       # Construct HTML table - conditioned on input
-       if len(args) > 0:
-               if args[0] == 'c':
-                       contact_handler_obj.build_html_table_by_contact()
-               elif args[0] == 'a':
-                       contact_handler_obj.build_html_table_by_amount()
-               else:
-                       print 'Invalid option: enter "c" for civi contacts or "a" for amounts'
-       else:
-               print 'Invalid args: need at least one argument; enter "c" for civi contacts or "a" for amounts'
-       
-if __name__ == "__main__":
-    sys.exit(main())

Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py   2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py   2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,205 +0,0 @@
-
-"""
-
-mine_impression_request.py
-
-wikimediafoundation.org
-Ryan Faulkner
-October 30th, 2010
-
-"""
-
-# =====================================
-# Script to mine impression data from squid logs
-# =====================================
-
-import MySQLdb
-import sys
-import urlparse as up
-import math
-
-import cgi
-import re
-import gzip
-
-# import numpy as np
-import miner_help as mh
-
-
-def mine_impression_requests(run_id, logFileName, db, cur):
-
-       # Initialization - open the file
-       # logFileName = sys.argv[1];
-       if (re.search('\.gz',logFileName)):
-               logFile = gzip.open(logFileName, 'r')
-       else:
-               logFile = open(logFileName, 'r')
-
-       queryIndex = 4;
-
-       counts = mh.AutoVivification()
-       insertStmt = 'INSERT INTO impression (utm_source, referrer, country, lang, counts, on_minute) values '
-
-       min_log = -1
-       hr_change = 0
-       clamp = 0
-
-       """ Clear the records for hour ahead of adding """
-       time_stamps = mh.get_timestamps(logFileName)
-       
-       start = time_stamps[0]
-       end = time_stamps[1]
-       
-       # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted
-       time_diff = mh.get_timestamps_diff(start, end) 
-               
-       if math.fabs(time_diff) <= 1.0:
-               deleteStmnt = 'delete from impression where on_minute >= \'' + start + '\' and on_minute < \'' + end + '\';'
-               
-               try:
-                       # cur.execute(deleteStmnt)
-                       print >> sys.stdout, "Executed delete from impression: " + deleteStmnt
-               except:
-                       print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..."
-                       pass
-       else:
-               print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..."
-       
-       
-       # PROCESS LOG FILE
-       # ================
-
-       line = logFile.readline()
-       while (line != ''):
-
-               lineArgs = line.split()
-
-               # Filter out time data by minute -- if the time is not properly formatted skip the record
-               # 2010-11-12T20:56:43.237
-               
-               try:
-                       time_stamp = lineArgs[2]
-                       time_bits = time_stamp.split('T')
-                       date_fields = time_bits[0].split('-')
-                       time_fields = time_bits[1].split(':')
-                       minute = int(time_fields[1])
-               except (ValueError, IndexError):
-                       line = logFile.readline()
-                       continue
-                       # pass
-                       
-               # Weird and confusing logic used to deal with logs that aren't sequential
-
-               if minute == 0 and not(hr_change) and not(clamp):
-                       min_log = -1
-
-               if minute == 1:
-                       hr_change = 0
-                       clamp = 1
-
-               # =================
-
-               try:
-                       url = lineArgs[8]
-               except IndexError:
-                       url = 'Unavailable'
-
-               parsedUrl = up.urlparse(url)
-               query = parsedUrl[queryIndex]
-               queryBits = cgi.parse_qs(query)
-
-               # Extract - project, banner, language, & country data
-               project = ''
-               if ('db' in queryBits.keys()):
-                       project = queryBits['db'][0]
-
-               if (project == '' and 'sitename' in queryBits.keys()):
-                       sitename = queryBits['sitename'][0];
-                       if sitename:
-                               project = sitename
-                       else:
-                               project = 'NONE'
-
-               if ('banner' in queryBits.keys()):
-                       banner = queryBits['banner'][0]
-               else:
-                       banner = 'NONE'
-
-               if ('userlang' in queryBits.keys()):
-                       lang = queryBits['userlang'][0]
-               else:
-                       lang = 'NONE'
-
-               if ('country' in queryBits.keys()):
-                       country = queryBits['country'][0]
-               else:
-                       country = 'NONE'
-
-
-               try:
-                       counts[banner][country][project][lang] = counts[banner][country][project][lang] + 1
-               except TypeError:
-                       counts[banner][country][project][lang] = 1
-
-               """
-               try:
-                       counts[att_1][att_2][att_3] = counts[att_1][att_2][att_3] + 1
-               except TypeError:
-                       counts[att_1][att_2][att_3] = 1
-               """
-
-               # Break out impression data by minute
-               if min_log < minute and not(hr_change):
-
-                       if minute == 0:
-                               hr_change = 1
-
-                       min_log = minute
-                       time_stamp_in = "convert(\'" + date_fields[0] + date_fields[1] + date_fields[2] + time_fields[0] + time_fields[1]  + "00\', datetime)"
-
-
-                       # print time_stamp_in
-
-                       bannerKeys = counts.keys()
-                       for banner_ind in range(len(bannerKeys)):
-                               banner = bannerKeys[banner_ind]
-                               countryCounts = counts[banner]
-                               countryKeys = countryCounts.keys()
-
-                               for country_ind in range(len(countryKeys)):
-                                       country = countryKeys[country_ind]
-                                       projectCounts = countryCounts[country]
-                                       projectKeys = projectCounts.keys()
-
-                                       for project_ind in range(len(projectKeys)):
-                                               project = projectKeys[project_ind]
-                                               langCounts = projectCounts[project]
-                                               langKeys = langCounts.keys()
-
-                                               for lang_ind in range(len(langKeys)):
-                                                       lang = langKeys[lang_ind]
-                                                       count = langCounts[lang]
-
-                                                       try:
-                                                               val = '(\'' + banner + '\',\'' + project + '\',\'' + country + '\',\'' + lang + '\',' \
-                                                               + str(count) + ',' + time_stamp_in + ');'
-                                                               cur.execute(insertStmt + val)
-                                                       except:
-                                                               db.rollback()
-                                                               sys.exit("Database Interface Exception - Could not execute statement:\n" + insertStmt + val)
-
-                       # Re-initialize counts
-                       counts = mh.AutoVivification()
-
-               line = logFile.readline()
-
-
-       # ACCESS THE DB
-       # ==============
-
-
-
-       # commit insert ops
-       # db.commit()
-
-

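A note for readers tracing the removed code: the heart of mine_impression_requests() above is query-string extraction with urlparse and cgi.parse_qs. Below is a minimal, self-contained sketch of that step in the project's Python 2.7 dialect; the function name and sample URL are illustrative, and it simplifies the original (which additionally fell back from 'db' to 'sitename' for the project field).

# Sketch only -- not part of this commit.
import urlparse as up
import cgi

def parse_impression_url(url):
    query = up.urlparse(url)[4]    # index 4 of the parse result is the query string
    bits = cgi.parse_qs(query)     # Python 2.x alias of urlparse.parse_qs
    get = lambda key: bits[key][0] if key in bits else 'NONE'
    # banner/country/db/userlang are the keys the deleted script read
    return get('banner'), get('country'), get('db'), get('userlang')

# Hypothetical example:
# parse_impression_url('http://example.org/beacon?banner=B_2010&db=enwiki&userlang=en&country=US')
# => ('B_2010', 'US', 'enwiki', 'en')
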
Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py        2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py        2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,282 +0,0 @@
-
-
-"""
-
-mine_landing_pages.py
-
-wikimediafoundation.org
-Ryan Faulkner
-October 30th, 2010
-
-"""
-
-# =====================================
-# Script to mine landing page data from squid logs
-# =====================================
-
-import sys
-import urlparse as up
-import httpagentparser
-import math
-
-import cgi     # web queries
-import re              # regular expression matching
-import gzip    # process gzipped logs
-
-import MySQLdb # db access
-
-import miner_help as mh
-
-# import urllib - python 3+
-
-def mine_landing_pages(run_id, logFileName, db, cur):
-
-       count_parse = 0
-       
-       # open the file
-       # logFileName = sys.argv[1]
-       if (re.search('\.gz',logFileName)):
-               logFile = gzip.open(logFileName, 'r')
-       else:
-               logFile = open(logFileName, 'r')
-
-
-       # Initialization
-       hostIndex = 1;
-       queryIndex = 4;
-       pathIndex = 2;
-
-       """ SQL Statements """
-
-       insertStmt_lp = 'INSERT INTO landing_page (utm_source, utm_campaign, utm_medium, landing_page,' + \
-       'page_url, referrer_url, browser, lang, country, project, ip, request_time) values '
-
-       """ Clear the records for hour ahead of adding """
-       time_stamps = mh.get_timestamps(logFileName)
-       
-       start = time_stamps[0]
-       end = time_stamps[1]
-       
-       # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted
-       time_diff = mh.get_timestamps_diff(start, end) 
-               
-       if math.fabs(time_diff) <= 1.0:
-               deleteStmnt = 'delete from landing_page where request_time >= \'' + start + '\' and request_time < \'' + end + '\';'
-               
-               try:
-                       # cur.execute(deleteStmnt)
-                       print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt
-               except:
-                       print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..."
-                       pass
-       else:
-               print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..."
-       
-       
-       count_correct = 0
-       count_total = 0
-       
-       # PROCESS LOG FILE
-       # ================
-       line = logFile.readline()
-       while (line != ''):
-               lineArgs = line.split()
-
-               # Get the IP Address of the donor
-               ip_add = lineArgs[4];
-
-
-               # Process Timestamp
-               # ================
-               # 2010-10-21T23:55:01.431
-               #  SELECT CAST('20070529 00:00:00' AS datetime)
-
-
-               datetime = lineArgs[2];
-
-               date_string = datetime.split('-')
-               time_string = datetime.split(':')
-
-               # if the date is not logged ignore the record
-               try:
-                       year = date_string[0]
-                       month = date_string[1]
-                       day = date_string[2][:2]
-                       hour = time_string[0][-2:]
-                       min = time_string[1]
-                       sec = time_string[2][:2]
-               except:
-                       line = logFile.readline()
-                       continue
-
-               timestamp_string = year + '-' + month + '-' + day + " " + hour + ":" + min + ":" + sec
-
-
-               # Process referrer URL
-               # ================
-
-               try:
-                       referrer_url = lineArgs[11]
-               except IndexError:
-                       referrer_url  = 'Unavailable'
-                       
-               parsed_referrer_url = up.urlparse(referrer_url)
-
-               if (parsed_referrer_url[hostIndex] == None):
-                       project = 'NONE';
-                       source_lang = 'NONE';
-               else:
-                       hostname = parsed_referrer_url[hostIndex].split('.')
-                       
-                       if ( len( hostname[0] ) <= 2 ) :
-                               # referrer_path = parsed_referrer_url[pathIndex].split('/')
-                               project = hostname[0]                           # wikimediafoundation.org
-                               source_lang = hostname[0]
-                       else:
-                               try:
-                                       project = hostname[0] if ( hostname[1] == 'wikimedia' ) else hostname[1]  # species.wikimedia vs en.wikinews
-                                       source_lang = hostname[0] if ( len(hostname[1]) < 5 ) else 'en'  # pl.wikipedia vs commons.wikimedia
-                               except:
-                                       project = ''
-                                       source_lang = 'en'
-                       
-               # Process User agent string
-               # =====================
-               
-               try:
-                       user_agent_string = lineArgs[13]
-               except IndexError:
-                       user_agent_string = ''
-                       
-               user_agent_fields = httpagentparser.detect(user_agent_string)
-               browser = 'NONE'
-
-               # Check to make sure fields exist
-               if len(user_agent_fields['browser']) != 0:
-                       if len(user_agent_fields['browser']['name']) != 0:
-                               browser = user_agent_fields['browser']['name']
-
-
-               # Process landing URL
-               # ================
-
-               try:
-                       landing_url = lineArgs[8]
-               except IndexError:
-                       landing_url = 'Unavailable'
-
-               
-               parsed_landing_url = up.urlparse(landing_url)
-               query_fields = cgi.parse_qs(parsed_landing_url[queryIndex]) # Get the banner name and lang
-               
-               
-               # Filter the landing URLs
-               #
-               # /wikimediafoundation.org/wiki/WMF/
-               # /wikimediafoundation.org/w/index.php?title=WMF/
-               path_pieces = parsed_landing_url[pathIndex].split('/')
-               try:
-                       
-                       c1 = re.search('WMF', path_pieces[2] )  != None
-                       c2 = re.search('Hear_from_Kartika', path_pieces[2])   != None
-                       cond1 = parsed_landing_url[hostIndex] == 'wikimediafoundation.org' and path_pieces[1] == 'wiki' and (c1 or c2)
-                       
-                       c1 = re.search('index.php', path_pieces[2] )  != None
-                       index_str_flag = c1
-                       try:
-                               c2 = re.search('WMF', query_fields['title'][0] ) != None
-                       except KeyError:
-                               c2 = 0
-                       cond2 = (parsed_landing_url[hostIndex] == 'wikimediafoundation.org' and path_pieces[1] == 'w' and c1 and c2)
-                       
-                       if cond2:
-                               count_parse = count_parse + 1
-                               
-                       regexp_res = re.search('Special:LandingCheck',landing_url)
-                       cond3 = (regexp_res == None)
-                       
-                       include_request = (cond1 or cond2) and cond3
-                        
-
-               except: 
-                       include_request = 0
-               
-               if include_request:
-                       
-                       # Address cases where the query string contains the landing page - ...wikimediafoundation.org/w/index.php?...
-                       if index_str_flag:
-                               try:
-                                       # URLs of the form ...?title=<lp_name>
-                                       lp_country = query_fields['title'][0].split('/')
-                                       landing_page = lp_country[0]
-                                       
-                                       # URLs of the form ...?country_code=<iso_code>
-                                       try:
-                                               country = query_fields['country_code'][0]
-                                               
-                                       # URLs of the form ...?title=<lp_name>/<lang>/<iso_code>
-                                       except:
-                                               if len(lp_country) == 3:
-                                                       country = lp_country[2]
-                                               else:
-                                                       country = lp_country[1]
-                                               
-                               except:
-                                       landing_page = 'NONE'
-                                       country = mh.localize_IP(cur, ip_add)
-                                       
-                       else: # ...wikimediafoundation.org/wiki/...
-                               
-                               landing_path = parsed_landing_url[pathIndex].split('/')
-                               landing_page = landing_path[2];
-                               
-                               # URLs of the form ...?country_code=<iso_code>
-                               try:
-                                       country = query_fields['country_code'][0]
-                               
-                               # URLs of the form ...<path>/<lp_name>/<lang>/<iso_code>
-                               except:
-                                       try:
-                                               if len(landing_path) == 5:
-                                                       country = landing_path[4]
-                                                       # source_lang = landing_path[3]
-                                               else:
-                                                       country = landing_path[3]
-                                                       
-                                       except:
-                                               country =  mh.localize_IP(cur, ip_add)
-                       
-                       # If country is confused with the language use the ip
-                       if country == country.lower():
-                               country = mh.localize_IP(cur, ip_add) 
-                                                       
-                       # ensure fields exist
-                       try:
-                               utm_source = query_fields['utm_source'][0]
-                               utm_campaign = query_fields['utm_campaign'][0]
-                               utm_medium = query_fields['utm_medium'][0];
-
-                       except KeyError:
-                               utm_source = 'NONE'
-                               utm_campaign = 'NONE'
-                               utm_medium = 'NONE'
-                               
-                       # INSERT INTO landing_page ('utm_source', 'utm_campaign', 'utm_medium', 'landing_page', 'page_url', 'lang', 'project', 'ip')  values ()
-                       try:
-                               val = '(\'' + utm_source + '\',\'' + utm_campaign + '\',\'' + utm_medium + '\',\'' + landing_page + \
-                               '\',\'' + landing_url + '\',\'' + referrer_url + '\',\'' + browser + '\',\'' + source_lang + '\',\'' + country + '\',\''  \
-                               + project + '\',\'' +  ip_add + '\',' + 'convert(\'' + timestamp_string + '\', datetime)' + ');'
-                               
-                               #print insertStmt + val
-                               cur.execute(insertStmt_lp + val)
-
-                       except:
-                               print "Could not insert:\n" + insertStmt_lp + val
-                               pass
-                               
-               line = logFile.readline()
-
-       
-
-       
-

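The timestamp handling removed above turned squid log fields such as '2010-10-21T23:55:01.431' into the 'YYYY-MM-DD HH:MM:SS' form that the convert(..., datetime) SQL wrapper expects. A simplified, hedged equivalent of that normalisation step (the function name is illustrative; the original instead split on '-' and ':' and skipped malformed records with try/except):

# Sketch only -- not part of this commit.
def squid_timestamp_to_sql(ts):
    date_part, time_part = ts.split('T')    # '2010-10-21', '23:55:01.431'
    return date_part + ' ' + time_part[:8]  # '2010-10-21 23:55:01'

# squid_timestamp_to_sql('2010-10-21T23:55:01.431') => '2010-10-21 23:55:01'
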
Deleted: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py        2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py        2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,268 +0,0 @@
-
-
-"""
-
-miner_help.py
-
-wikimediafoundation.org
-Ryan Faulkner
-October 30th, 2010
-
-"""
-
-# ===================
-# Helper script for mining
-# ===================
-
-
-import sys
-import math
-import calendar as cal
-import csv
-import MySQLdb
-
-
-
-""" Determines the following hour based on the precise date to the hour """
-def getNextHour(year, month, day, hour):
-
-       lastDayofMonth = cal.monthrange(year,month)[1]
-
-       next_year = year
-       next_month = month
-       next_day = day
-       next_hour = hour + 1
-
-       if hour == 23:
-               next_hour = 0
-               if day == lastDayofMonth:
-                       next_day = 1
-                       if month == 12:
-                               next_month = 1
-                               next_year = year + 1
-
-       return [next_year, next_month, next_day, next_hour]
-
-""" Determines the previous hour based on the precise date to the hour """
-def getPrevHour(year, month, day, hour):
-       
-       if month == 1:
-               last_year = year - 1
-               last_month = 12
-       else:
-               last_year = year
-               last_month = month - 1
-               
-       lastDayofPrevMonth = cal.monthrange(year,last_month)[1]
-               
-       prev_year = year
-       prev_month = month
-       prev_day = day
-       prev_hour = hour - 1
-
-       if prev_hour == -1:
-               prev_hour = 23
-               if day == 1:
-                       prev_day = lastDayofPrevMonth
-                       prev_month = last_month
-                       prev_year = last_year
-               else:
-                       prev_day = day - 1
-                       
-       return [prev_year, prev_month, prev_day, prev_hour]
-
-
-class AutoVivification(dict):
-       """Implementation of perl's autovivification feature."""
-       def __getitem__(self, item):
-               try:
-                       return dict.__getitem__(self, item)
-               except KeyError:
-                       value = self[item] = type(self)()
-                       return value
-
-def read_sql(filename):
-
-       sql_file = open(filename, 'r')
-
-       sql_stmnt = ''
-       line = sql_file.readline()
-       while (line != ''):
-               sql_stmnt = sql_stmnt + line
-               line = sql_file.readline()
-       
-       sql_file.close()
-       
-       return sql_stmnt
-       
-def drange(start, stop, step):
-       
-       if step < 1:
-               gain = math.floor(1 / step)
-               lst = range(0, ((stop-start) * gain), 1)
-               return [start + x * step for x in lst]
-       else:
-               return range(start, stop, step)
-       
-
-def mod_list(lst, modulus):
-       return [x % modulus for x in lst]
-               
-""" Extract a timestamp from the filename """
-def get_timestamps(logFileName):
-       
-       fname_parts = logFileName.split('-')
-
-       year = int(fname_parts[1])
-       month = int(fname_parts[2])
-       day = int(fname_parts[3])
-       hour = int(fname_parts[4][0:2])
-       
-       # Is this an afternoon log?
-       afternoon = (fname_parts[4][2:4] == 'PM') 
-        
-       # Adjust the hour as necessary if == 12AM or *PM
-       if afternoon and hour < 12:
-               hour = hour + 12
-               
-       if not(afternoon) and hour == 12:
-               hour = 0
-
-       prev_hr = getPrevHour(year, month, day, hour)
-       
-       str_month = '0' + str(month) if month < 10 else str(month)
-       str_day = '0' + str(day) if day < 10 else str(day)
-       str_hour = '0' + str(hour) if hour < 10 else str(hour)
-       
-       prev_month = prev_hr[1] 
-       prev_day = prev_hr[2]
-       prev_hour = prev_hr[3]
-       str_prev_month = '0' + str(prev_month) if prev_month < 10 else str(prev_month)
-       str_prev_day = '0' + str(prev_day) if prev_day < 10 else str(prev_day)
-       str_prev_hour = '0' + str(prev_hour) if prev_hour < 10 else str(prev_hour)
-       
-       log_end = str(year) + str_month + str_day + str_hour + '5500'
-       log_start = str(prev_hr[0]) + str_prev_month + str_prev_day + str_prev_hour + '5500'
-
-       #log_start = str(year) + str(month) + str(day) + str(hour) + '5500'
-       #log_end = str(prev_hr[0]) + str(prev_hr[1]) + str(prev_hr[2]) + str(prev_hr[3]) + '5500'
-
-       return [log_start, log_end]
-       
-
-""" Compute the difference between two timestamps """
-def get_timestamps_diff(timestamp_start, timestamp_end):
-       
-       year_1 = int(timestamp_start[0:4])
-       month_1 = int(timestamp_start[4:6])
-       day_1 = int(timestamp_start[6:8])
-       hr_1 = int(timestamp_start[8:10])
-       min_1 = int(timestamp_start[10:12])
-       
-       year_2 = int(timestamp_end[0:4])
-       month_2 = int(timestamp_end[4:6])
-       day_2 = int(timestamp_end[6:8])
-       hr_2 = int(timestamp_end[8:10])
-       min_2 = int(timestamp_end[10:12])
-       
-       t1 = cal.datetime.datetime(year=year_1, month=month_1, day=day_1, hour=hr_1, minute=min_1,second=0)
-       t2 = cal.datetime.datetime(year=year_2, month=month_2, day=day_2, hour=hr_2, minute=min_2,second=0)
-       
-       diff = t2 - t1
-       diff = float(diff.seconds) / 3600
-       
-       return diff
-       
-""" Converts a list to a dictionary or vice versa -- INCOMPLETE MAY BE USEFUL AT SOME FUTURE POINT """
-def convert_list_dict(collection):
-       
-       if type(collection) is dict:
-               new_collection = list()
-               
-       elif type(collection) is list:
-               new_collection = dict()
-               
-       else:
-               print "miner_help::convert_list_dict:  Invalid type, must be a list or a dictionary."
-               return 0;
-
-       return new_collection
-       
-""" Given an IP localizes the country """
-def localize_IP(cur, ip_string):
-       
-       # compute ip number
-       ip_fields = ip_string.split('.')
-       w = int(ip_fields[0])
-       x = int(ip_fields[1])
-       y = int(ip_fields[2])
-       z = int(ip_fields[3])
-       
-       ip_num = 16777216 * w + 65536 * x + 256 * y + z;
-       
-       sql_stmnt = 'select country_ISO_1 from ip_country where ' + str(ip_num) + ' >= ip_from and ' + str(ip_num) + ' <= ip_to'
-       
-       try:
-               cur.execute(sql_stmnt)
-               row = cur.fetchone()
-       except:
-               db.rollback()
-               sys.exit("Could not execute: " + sql_stmnt)
-       
-       try:
-               country = row[0]
-       except:
-               country = ''
-       
-       return country
-       
-""" Load data into the IP localization table to associate IPs with countries """
-def load_IP_localization_table():
-       
-       """ Get db object / Create cursor  """
-       # db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307)
-       db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
-       cur = db.cursor()
-       
-       # Parse CSV file 
-       ipReader = csv.reader(open('./csv/IpToCountry.csv', 'rb'))
-       insert_stmnt = 'INSERT INTO ip_country VALUES '
-       # (ip_from,ip_to,registry,assigned,country_ISO_1,country_ISO_2,country_name)
-       header = 1
-       for row in ipReader:
-               # skip the csv comments
-               if row[0][0] != '#':
-                       header = 0
-               
-               if not(header):
-                       
-                       for i in range(len(row)):
-                               pieces = row[i].split('\'')
-                               
-                               if len(pieces) > 1:
-                                       new_str = pieces[0]
-                                       
-                                       # remove single quotes from fields
-                                       for j in range(1,len(pieces)):
-                                               new_str = new_str + ' ' + pieces[j]
-                               
-                                       row[i] = new_str
-                                               
-                       vals = '\',\''.join(row)
-                       sql_stmnt = insert_stmnt + '(\'' + vals + '\')'
-                       
-                       print vals
-                       #cur.execute(sql_stmnt)
-                       try:
-                               cur.execute(sql_stmnt)
-                       except:
-                               db.rollback()
-                               sys.exit("Could not insert: " + sql_stmnt)
-       
-       
-       # Commit to the db
-       db.commit()
-
-       # Close connection
-       cur.close()
-       db.close()

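Of the helpers deleted above, AutoVivification is the one the mining scripts lean on hardest: reading a missing key creates a nested dictionary on the fly, so multi-level counters need no initialisation. A brief usage sketch of the increment pattern as it appeared in mine_impression_request.py (the key values are illustrative):

# Sketch only -- not part of this commit. On a fresh key the read returns a
# nested AutoVivification rather than a number, so += 1 raises TypeError and
# the except branch seeds the counter with 1.
counts = AutoVivification()
try:
    counts['banner']['US']['enwiki']['en'] += 1
except TypeError:
    counts['banner']['US']['enwiki']['en'] = 1
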
Deleted: trunk/fundraiser-statistics/fundraiser-scripts/settings.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/settings.py  2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/settings.py  2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,12 +0,0 @@
-__home__ = '/home/rfaulkner/trunk/projects/'
-__project_home__ = '/home/rfaulkner/trunk/projects/Fundraiser_Tools/'
-__web_home__ = '/home/rfaulkner/trunk/projects/Fundraiser_Tools/web_reporting/'
-__squid_log_server__ = 'hume.wikimedia.org'
-__squid_log_home__ = '/a/static/uncompressed/udplogs/'
-__squid_log_local_home__ = __project_home__ + 'squid_logs/'
-
-__user__ = 'rfaulk'
-__db__ = 'faulkner'
-__db_server__ = '127.0.0.1'
-__db_port__ = 3307
-__pass__='baggin5'
\ No newline at end of file

Deleted: trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py        2011-06-28 22:06:17 UTC (rev 91024)
+++ trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py        2011-06-28 22:09:09 UTC (rev 91025)
@@ -1,79 +0,0 @@
-
-
-"""
-
-squid_miner_script.py
-
-wikimediafoundation.org
-Ryan Faulkner
-November 4th, 2010
-
-"""
-
-"""
-
-squid_miner_script.py:
-
-Script to mine landing page data from squid logs
-
-"""
-
-
-import sys
-import MySQLdb
-
-import miner_help as mh
-import mine_landing_pages as mlp
-import mine_impression_request as mir
-
-
-
-"""
-Process command line args
-
-1 - i for impressions, l for landingpages
-2 - log file name
-
-"""
-
-try:
-       mine_option = sys.argv[1]
-       logFileName = sys.argv[2]
-except IndexError:
-       sys.exit('Please enter a mining option and log file path and name\n i for impressions\n l for landing pages\n' \
-       + ' e.g. $squid_miner_script.py i bannerImpressions.log\n')
-
-if (mine_option != 'i' and mine_option != 'l') or len(mine_option) > 1 :
-       sys.exit('Please select a valid mining option\n i for impressions\n     l for landing pages\n' \
-       +  ' e.g. $squid_miner_script.py i bannerImpressions.log\n')
-
-
-
-"""
-
-INITIALIZE DB ACCESS
-
-"""
-
-""" Establish connection """
-db = MySQLdb.connect(host='localhost', user='root', passwd='baggin5', db='faulkner')
-# db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner')
-#db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
-
-""" Create cursor """
-cur = db.cursor()
-
-
-
-run_id = 0
-if mine_option == 'l':
-       mlp.mine_landing_pages(run_id, logFileName, db, cur)
-else:
-       mir.mine_impression_requests(run_id, logFileName, db, cur)
-       
-# Commit to the db
-db.commit()
-
-# Close connection
-cur.close()
-db.close()


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
