http://www.mediawiki.org/wiki/Special:Code/MediaWiki/91025
Revision: 91025 Author: rfaulk Date: 2011-06-28 22:09:09 +0000 (Tue, 28 Jun 2011) Log Message: ----------- contents moved to repository http://svn.wikimedia.org/viewvc/wikimedia/trunk/fundraiser-analysis/ Modified Paths: -------------- trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject Removed Paths: ------------- trunk/fundraiser-statistics/fundraiser-scripts/__init__.py trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py trunk/fundraiser-statistics/fundraiser-scripts/html/ trunk/fundraiser-statistics/fundraiser-scripts/logs/ trunk/fundraiser-statistics/fundraiser-scripts/mine-log/ trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py trunk/fundraiser-statistics/fundraiser-scripts/settings.py trunk/fundraiser-statistics/fundraiser-scripts/squid_logs/ trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py trunk/fundraiser-statistics/fundraiser-scripts/wrappers/ Modified: trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/.pydevproject 2011-06-28 22:09:09 UTC (rev 91025) @@ -5,6 +5,8 @@ <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property> <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property> <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH"> -<path>/Fundraiser_Tools/src</path> +<path>/Fundraiser_Tools</path> +<path>/Fundraiser_Tools/classes</path> +<path>/Fundraiser_Tools/wrappers</path> </pydev_pathproperty> </pydev_project> Deleted: trunk/fundraiser-statistics/fundraiser-scripts/__init__.py =================================================================== Deleted: trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/data_aggregation_script.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,62 +0,0 @@ - -""" - -data_aggregation_script.py - -wikimediafoundation.org -Ryan Faulkner -Novemner 8th, 2010 - -""" - -import sys -import MySQLdb -import model_data as md - - -""" -Parse the input args - -""" -try: - start_time = sys.argv[1] - end_time = sys.argv[2] - banner = sys.argv[3] - # sql_file_name = sys.argv[3] - -except IndexError: - sys.exit('Please enter a UTC start and end time: \n' \ - + ' e.g. $data_aggregation_scipt.py 20101010000000 20101010010000 2010_testing50\n') - - - -""" -INITIALIZE DB ACCESS - -""" - -""" Establish connection """ -# db = MySQLdb.connect(host='localhost', user='root', passwd='baggin5', db='faulkner') -db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner') - -""" Create cursor """ -cur = db.cursor() - - -""" -Generate a unique test ID - -""" - -sql_file_name_2step = './sql/full_analytics_2step.sql' -sql_file_name_1step = './sql/full_analytics_1step.sql' -# sql_file_name = './sql/test.sql' -# sql_file_name = './sql/landpage_test.sql' -# start_time = '20101104234000' -# end_time = '20101105010000' - -md.compile_test_instance_data(db, cur, sql_file_name_2step, sql_file_name_1step, start_time, end_time) - -# Close connection -cur.close() -db.close() Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/mine_contacts.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,266 +0,0 @@ - -""" - -mine_contacts.py - -wikimediafoundation.org -Ryan Faulkner -February 11th, 2010 - -""" - -# ============================================= -# Pulls metrics from database to perform statistical analysis -# ============================================= - -import sys -import getopt -import re -import datetime - -import MySQLdb -import HTML - -import test_reporting as tr -import miner_help as mh -import query_store as qs -import fundraiser_reporting as fr - - - -class contact_handler(tr.data_handler, fr.TimestampProcesser): - - __query_obj = qs.query_store() - __file_name = './csv/Rebeccas_Contacts_Donation_Alerts_mod.csv' - __sql_path = './sql/' - __html_path = './html/' - __query_handle_contact = 'report_ecomm_by_contact' - __query_handle_amount = 'report_ecomm_by_amount' - - - - def __init__(self): - super(tr.data_handler, self).__init__() - - """ - - method :: parse_contacts: - - Constants: - ========= - - file_name = ./csv/Rebeccas_Contacts_Donation_Alerts.csv - - """ - - def parse_contacts(self): - - # Initialization - open the file - # FileName = sys.argv[1]; - if (re.search('\.gz',self.__file_name)): - file_obj = gzip.open(self.__file_name, 'r') - else: - file_obj = open(self.__file_name, 'r') - - - # Sample from csv file - # Internal Contact ID, Sort Name, First Name, Last Name, Contact Type - # 120842, "Abramowicz, David", David, Abramowicz, Individual - index_first_name = 3 - index_last_name = 4 - contact_list = list() - - - # PROCESS LOG FILE - # ================ - line = file_obj.readline() - while (line != ''): - - lineArgs = line.split(',') - # print lineArgs[index_first_name] + ' ' + lineArgs[index_last_name] - contact_list.append([lineArgs[index_first_name], lineArgs[index_last_name]]) - line = file_obj.readline() - - return contact_list - - - """ - - Civi Reporting - Create a table from a list of contacts - - This method looks at - - Constants: - ========= - - file_name = ./sql/Rebeccas_Contacts_Donation_Alerts.csv - - - """ - def build_html_table_by_contact(self): - - - # get the contacts from the list - list = self.parse_contacts() - - table_data = [] - sql_stmnt = mh.read_sql(self.__sql_path + self.__query_handle_contact + '.sql'); - - # open the html files for writing - f = open(self.__html_path + self.__query_handle_contact + '.html', 'w') - - # construct contact where string - # iterate through the list - where_str = 'where ' - first_item = 0 - for i in list[1:]: - - # only process People now - not organizations or Households - if not(re.search('Organization', i[0]) or re.search('Organization', i[1]) or re.search('Household', i[0]) or re.search('Household', i[1])): - if not(first_item): - first_item = 1 - else: - where_str = where_str + ' or ' - - where_str = where_str + '(first_name = \'' + i[0].strip() + '\' and last_name = \'' + i[1].strip() + '\')' - - # Formats the statement according to query type - select_stmnt = self.__query_obj.format_query(self.__query_handle_contact, sql_stmnt, [where_str]) - #print select_stmnt - - # initialize the db and execute the query - self.init_db() - - try: - - # execute statement gathering amounts - err_msg = select_stmnt - self._cur.execute(select_stmnt) - results = self._cur.fetchall() - - for row in results: - cpRow = self.listify(row) - table_data.append(cpRow) - - except: - self._db.rollback() - sys.exit("Database Interface Exception:\n" + err_msg) - - self.close_db() - - # Construct the html table - header = self.__query_obj.get_query_header(self.__query_handle_contact) - t = HTML.table(table_data, header_row=header) - htmlcode = str(t) - - f.write(htmlcode) - f.close() - - return htmlcode - - - - """ - - Civi Reporting - Create a table from a list of contacts - Creates two html tables - - """ - def build_html_table_by_amount(self): - - - # Initialize times - now = datetime.datetime.now() - hours_back = 24 * 7 - - # start_time, end_time = self.gen_date_strings_hr(now, hours_back) - start_time, end_time = self.gen_date_strings(now, hours_back, 2, 3) - - - # Prepare SQL statements and tables - table_data = [] - sql_stmnt = mh.read_sql(self.__sql_path + self.__query_handle_amount + '.sql'); - - # open the html files for writing - f = open(self.__html_path + self.__query_handle_amount + '.html', 'w') - - # Formats the statement according to query type - select_stmnt = self.__query_obj.format_query(self.__query_handle_amount, sql_stmnt, [start_time, end_time]) - - # initialize the db and execute the query - self.init_db() - - try: - - # execute statement gathering amounts - err_msg = select_stmnt - self._cur.execute(select_stmnt) - results = self._cur.fetchall() - - for row in results: - cpRow = self.listify(row) - table_data.append(cpRow) - - - except: - self._db.rollback() - sys.exit("Database Interface Exception:\n" + err_msg) - - self.close_db() - - # Construct the html table - header = self.__query_obj.get_query_header(self.__query_handle_amount) - t = HTML.table(table_data, header_row=header) - htmlcode = str(t) - - f.write(htmlcode) - f.close() - - return htmlcode - - - -""" - -method :: main - -to parse contact names invoke: - -rfaulkner@wmf128:~/trunk/projects/fundraiser-statistics/fundraiser-scripts$ python mine_contacts.py /home/rfaulkner/trunk/docs/Rebeccas_Contacts_Donation_Alerts.csv - -""" - -class Usage(Exception): - def __init__(self, msg): - self.msg = msg - -def main(argv=None): - if argv is None: - argv = sys.argv - try: - try: - opts, args = getopt.getopt(argv[1:], "h", ["help"]) - except getopt.error, msg: - raise Usage(msg) - - # more code, unchanged - except Usage, err: - print >>sys.stderr, err.msg - print >>sys.stderr, "for help use --help" - return 2 - - contact_handler_obj = contact_handler() - - # Construct HTML table - conditioned on input - if len(args) > 0: - if args[0] == 'c': - contact_handler_obj.build_html_table_by_contact() - elif args[0] == 'a': - contact_handler_obj.build_html_table_by_amount() - else: - print 'Invalid option: enter "c" for civi contacts or "a" for amounts' - else: - print 'Invalid args: need at least one argument; enter "c" for civi contacts or "a" for amounts' - -if __name__ == "__main__": - sys.exit(main()) Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/mine_impression_request.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,205 +0,0 @@ - -""" - -mine_impression_request.py - -wikimediafoundation.org -Ryan Faulkner -October 30th, 2010 - -""" - -# ===================================== -# Script to mine impression data from squid logs -# ===================================== - -import MySQLdb -import sys -import urlparse as up -import math - -import cgi -import re -import gzip - -# import numpy as np -import miner_help as mh - - -def mine_impression_requests(run_id, logFileName, db, cur): - - # Initialization - open the file - # logFileName = sys.argv[1]; - if (re.search('\.gz',logFileName)): - logFile = gzip.open(logFileName, 'r') - else: - logFile = open(logFileName, 'r') - - queryIndex = 4; - - counts = mh.AutoVivification() - insertStmt = 'INSERT INTO impression (utm_source, referrer, country, lang, counts, on_minute) values ' - - min_log = -1 - hr_change = 0 - clamp = 0 - - """ Clear the records for hour ahead of adding """ - time_stamps = mh.get_timestamps(logFileName) - - start = time_stamps[0] - end = time_stamps[1] - - # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted - time_diff = mh.get_timestamps_diff(start, end) - - if math.fabs(time_diff) <= 1.0: - deleteStmnt = 'delete from impression where on_minute >= \'' + start + '\' and on_minute < \'' + end + '\';' - - try: - # cur.execute(deleteStmnt) - print >> sys.stdout, "Executed delete from impression: " + deleteStmnt - except: - print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..." - pass - else: - print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..." - - - # PROCESS LOG FILE - # ================ - - line = logFile.readline() - while (line != ''): - - lineArgs = line.split() - - # Filter out time data by minute -- if the time is not properly formatted skip the record - # 2010-11-12T20:56:43.237 - - try: - time_stamp = lineArgs[2] - time_bits = time_stamp.split('T') - date_fields = time_bits[0].split('-') - time_fields = time_bits[1].split(':') - minute = int(time_fields[1]) - except (ValueError, IndexError): - line = logFile.readline() - continue - # pass - - # Weird and confusing logic used to deal with logs that aren't sequential - - if minute == 0 and not(hr_change) and not(clamp): - min_log = -1 - - if minute == 1: - hr_change = 0 - clamp = 1 - - # ================= - - try: - url = lineArgs[8] - except IndexError: - url = 'Unavailable' - - parsedUrl = up.urlparse(url) - query = parsedUrl[queryIndex] - queryBits = cgi.parse_qs(query) - - # Extract - project, banner, language, & country data - project = '' - if ('db' in queryBits.keys()): - project = queryBits['db'][0] - - if (project == '' and 'sitename' in queryBits.keys()): - sitename = queryBits['sitename'][0]; - if sitename: - project = sitename - else: - project = 'NONE' - - if ('banner' in queryBits.keys()): - banner = queryBits['banner'][0] - else: - banner = 'NONE' - - if ('userlang' in queryBits.keys()): - lang = queryBits['userlang'][0] - else: - lang = 'NONE' - - if ('country' in queryBits.keys()): - country = queryBits['country'][0] - else: - country = 'NONE' - - - try: - counts[banner][country][project][lang] = counts[banner][country][project][lang] + 1 - except TypeError: - counts[banner][country][project][lang] = 1 - - """ - try: - counts[att_1][att_2][att_3] = counts[att_1][att_2][att_3] + 1 - except TypeError: - counts[att_1][att_2][att_3] = 1 - """ - - # Break out impression data by minute - if min_log < minute and not(hr_change): - - if minute == 0: - hr_change = 1 - - min_log = minute - time_stamp_in = "convert(\'" + date_fields[0] + date_fields[1] + date_fields[2] + time_fields[0] + time_fields[1] + "00\', datetime)" - - - # print time_stamp_in - - bannerKeys = counts.keys() - for banner_ind in range(len(bannerKeys)): - banner = bannerKeys[banner_ind] - countryCounts = counts[banner] - countryKeys = countryCounts.keys() - - for country_ind in range(len(countryKeys)): - country = countryKeys[country_ind] - projectCounts = countryCounts[country] - projectKeys = projectCounts.keys() - - for project_ind in range(len(projectKeys)): - project = projectKeys[project_ind] - langCounts = projectCounts[project] - langKeys = langCounts.keys() - - for lang_ind in range(len(langKeys)): - lang = langKeys[lang_ind] - count = langCounts[lang] - - try: - val = '(\'' + banner + '\',\'' + project + '\',\'' + country + '\',\'' + lang + '\',' \ - + str(count) + ',' + time_stamp_in + ');' - cur.execute(insertStmt + val) - except: - db.rollback() - sys.exit("Database Interface Exception - Could not execute statement:\n" + insertStmt + val) - - # Re-initialize counts - counts = mh.AutoVivification() - - line = logFile.readline() - - - # ACCESS THE DB - # ============== - - - - # commit insert ops - # db.commit() - - Deleted: trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/mine_landing_pages.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,282 +0,0 @@ - - -""" - -mine_landing_pages.py - -wikimediafoundation.org -Ryan Faulkner -October 30th, 2010 - -""" - -# ===================================== -# Script to mine landing page data from squid logs -# ===================================== - -import sys -import urlparse as up -import httpagentparser -import math - -import cgi # web queries -import re # regular expression matching -import gzip # process gzipped logs - -import MySQLdb # db access - -import miner_help as mh - -# import urllib - python 3+ - -def mine_landing_pages(run_id, logFileName, db, cur): - - count_parse = 0 - - # open the file - # logFileName = sys.argv[1] - if (re.search('\.gz',logFileName)): - logFile = gzip.open(logFileName, 'r') - else: - logFile = open(logFileName, 'r') - - - # Initialization - hostIndex = 1; - queryIndex = 4; - pathIndex = 2; - - """ SQL Statements """ - - insertStmt_lp = 'INSERT INTO landing_page (utm_source, utm_campaign, utm_medium, landing_page,' + \ - 'page_url, referrer_url, browser, lang, country, project, ip, request_time) values ' - - """ Clear the records for hour ahead of adding """ - time_stamps = mh.get_timestamps(logFileName) - - start = time_stamps[0] - end = time_stamps[1] - - # Ensure that the range is correct; otherwise abort - critical that outside records are not deleted - time_diff = mh.get_timestamps_diff(start, end) - - if math.fabs(time_diff) <= 1.0: - deleteStmnt = 'delete from landing_page where request_time >= \'' + start + '\' and request_time < \'' + end + '\';' - - try: - # cur.execute(deleteStmnt) - print >> sys.stdout, "Executed delete from landing page: " + deleteStmnt - except: - print >> sys.stderr, "Could not execute delete:\n" + deleteStmnt + "\nResuming insert ..." - pass - else: - print >> sys.stdout, "Could not execute delete statement, DIFF too large\ndiff = " + str(time_diff) + "\ntime_start = " + start + "\ntime_end = " + end + "\nResuming insert ..." - - - count_correct = 0 - count_total = 0 - - # PROCESS LOG FILE - # ================ - line = logFile.readline() - while (line != ''): - lineArgs = line.split() - - # Get the IP Address of the donor - ip_add = lineArgs[4]; - - - # Process Timestamp - # ================ - # 2010-10-21T23:55:01.431 - # SELECT CAST('20070529 00:00:00' AS datetime) - - - datetime = lineArgs[2]; - - date_string = datetime.split('-') - time_string = datetime.split(':') - - # if the date is not logged ignoere the record - try: - year = date_string[0] - month = date_string[1] - day = date_string[2][:2] - hour = time_string[0][-2:] - min = time_string[1] - sec = time_string[2][:2] - except: - line = logFile.readline() - continue - - timestamp_string = year + '-' + month + '-' + day + " " + hour + ":" + min + ":" + sec - - - # Process referrer URL - # ================ - - try: - referrer_url = lineArgs[11] - except IndexError: - referrer_url = 'Unavailable' - - parsed_referrer_url = up.urlparse(referrer_url) - - if (parsed_referrer_url[hostIndex] == None): - project = 'NONE'; - source_lang = 'NONE'; - else: - hostname = parsed_referrer_url[hostIndex].split('.') - - if ( len( hostname[0] ) <= 2 ) : - # referrer_path = parsed_referrer_url[pathIndex].split('/') - project = hostname[0] # wikimediafoundation.org - source_lang = hostname[0] - else: - try: - project = hostname[0] if ( hostname[1] == 'wikimedia' ) else hostname[1] # species.wikimedia vs en.wikinews - source_lang = hostname[0] if ( len(hostname[1]) < 5 ) else 'en' # pl.wikipedia vs commons.wikimedia - except: - project = '' - source_lang = 'en' - - # Process User agent string - # ===================== - - try: - user_agent_string = lineArgs[13] - except IndexError: - user_agent_string = '' - - user_agent_fields = httpagentparser.detect(user_agent_string) - browser = 'NONE' - - # Check to make sure fields exist - if len(user_agent_fields['browser']) != 0: - if len(user_agent_fields['browser']['name']) != 0: - browser = user_agent_fields['browser']['name'] - - - # Process landing URL - # ================ - - try: - landing_url = lineArgs[8] - except IndexError: - landing_url = 'Unavailable' - - - parsed_landing_url = up.urlparse(landing_url) - query_fields = cgi.parse_qs(parsed_landing_url[queryIndex]) # Get the banner name and lang - - - # Filter the landing URLs - # - # /wikimediafoundation.org/wiki/WMF/ - # /wikimediafoundation.org/w/index.php?title=WMF/ - path_pieces = parsed_landing_url[pathIndex].split('/') - try: - - c1 = re.search('WMF', path_pieces[2] ) != None - c2 = re.search('Hear_from_Kartika', path_pieces[2]) != None - cond1 = parsed_landing_url[hostIndex] == 'wikimediafoundation.org' and path_pieces[1] == 'wiki' and (c1 or c2) - - c1 = re.search('index.php', path_pieces[2] ) != None - index_str_flag = c1 - try: - c2 = re.search('WMF', query_fields['title'][0] ) != None - except KeyError: - c2 = 0 - cond2 = (parsed_landing_url[hostIndex] == 'wikimediafoundation.org' and path_pieces[1] == 'w' and c1 and c2) - - if cond2: - count_parse = count_parse + 1 - - regexp_res = re.search('Special:LandingCheck',landing_url) - cond3 = (regexp_res == None) - - include_request = (cond1 or cond2) and cond3 - - - except: - include_request = 0 - - if include_request: - - # Address cases where the query string contains the landing page - ...wikimediafoundation.org/w/index.php?... - if index_str_flag: - try: - # URLs of the form ...?title=<lp_name> - lp_country = query_fields['title'][0].split('/') - landing_page = lp_country[0] - - # URLs of the form ...?county_code=<iso_code> - try: - country = query_fields['country_code'][0] - - # URLs of the form ...?title=<lp_name>/<lang>/<iso_code> - except: - if len(lp_country) == 3: - country = lp_country[2] - else: - country = lp_country[1] - - except: - landing_page = 'NONE' - country = mh.localize_IP(cur, ip_add) - - else: # ...wikimediafoundation.org/wiki/... - - landing_path = parsed_landing_url[pathIndex].split('/') - landing_page = landing_path[2]; - - # URLs of the form ...?county_code=<iso_code> - try: - country = query_fields['country_code'][0] - - # URLs of the form ...<path>/ <lp_name>/<lang>/<iso_code> - except: - try: - if len(landing_path) == 5: - country = landing_path[4] - # source_lang = landing_path[3] - else: - country = landing_path[3] - - except: - country = mh.localize_IP(cur, ip_add) - - # If country is confused with the language use the ip - if country == country.lower(): - country = mh.localize_IP(cur, ip_add) - - # ensure fields exist - try: - utm_source = query_fields['utm_source'][0] - utm_campaign = query_fields['utm_campaign'][0] - utm_medium = query_fields['utm_medium'][0]; - - except KeyError: - utm_source = 'NONE' - utm_campaign = 'NONE' - utm_medium = 'NONE' - - # INSERT INTO landing_page ('utm_source', 'utm_campaign', 'utm_medium', 'landing_page', 'page_url', 'lang', 'project', 'ip') values () - try: - val = '(\'' + utm_source + '\',\'' + utm_campaign + '\',\'' + utm_medium + '\',\'' + landing_page + \ - '\',\'' + landing_url + '\',\'' + referrer_url + '\',\'' + browser + '\',\'' + source_lang + '\',\'' + country + '\',\'' \ - + project + '\',\'' + ip_add + '\',' + 'convert(\'' + timestamp_string + '\', datetime)' + ');' - - #print insertStmt + val - cur.execute(insertStmt_lp + val) - - except: - print "Could not insert:\n" + insertStmt_lp + val - pass - - line = logFile.readline() - - - - - Deleted: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,268 +0,0 @@ - - -""" - -miner_helper.py - -wikimediafoundation.org -Ryan Faulkner -October 30th, 2010 - -""" - -# =================== -# Helper script for mining -# =================== - - -import sys -import math -import calendar as cal -import csv -import MySQLdb - - - -""" Determines the following hour based on the precise date to the hour """ -def getNextHour(year, month, day, hour): - - lastDayofMonth = cal.monthrange(year,month)[1] - - next_year = year - next_month = month - next_day = day - next_hour = hour + 1 - - if hour == 23: - next_hour = 0 - if day == lastDayofMonth: - next_day = 1 - if month == 12: - next_month = 1 - next_year = year + 1 - - return [next_year, next_month, next_day, next_hour] - -""" Determines the previous hour based on the precise date to the hour """ -def getPrevHour(year, month, day, hour): - - if month == 1: - last_year = year - 1 - last_month = 12 - else: - last_year = year - last_month = month - 1 - - lastDayofPrevMonth = cal.monthrange(year,last_month)[1] - - prev_year = year - prev_month = month - prev_day = day - prev_hour = hour - 1 - - if prev_hour == -1: - prev_hour = 23 - if day == 1: - prev_day = lastDayofPrevMonth - prev_month = last_month - prev_year = last_year - else: - prev_day = day - 1 - - return [prev_year, prev_month, prev_day, prev_hour] - - -class AutoVivification(dict): - """Implementation of perl's autovivification feature.""" - def __getitem__(self, item): - try: - return dict.__getitem__(self, item) - except KeyError: - value = self[item] = type(self)() - return value - -def read_sql(filename): - - sql_file = open(filename, 'r') - - sql_stmnt = '' - line = sql_file.readline() - while (line != ''): - sql_stmnt = sql_stmnt + line - line = sql_file.readline() - - sql_file.close() - - return sql_stmnt - -def drange(start, stop, step): - - if step < 1: - gain = math.floor(1 / step) - lst = range(0, ((stop-start) * gain), 1) - return [start + x * step for x in lst] - else: - return range(start, stop, step) - - -def mod_list(lst, modulus): - return [x % modulus for x in lst] - -""" Extract a timestamp from the filename """ -def get_timestamps(logFileName): - - fname_parts = logFileName.split('-') - - year = int(fname_parts[1]) - month = int(fname_parts[2]) - day = int(fname_parts[3]) - hour = int(fname_parts[4][0:2]) - - # Is this an afternoon log? - afternoon = (fname_parts[4][2:4] == 'PM') - - # Adjust the hour as necessary if == 12AM or *PM - if afternoon and hour < 12: - hour = hour + 12 - - if not(afternoon) and hour == 12: - hour = 0 - - prev_hr = getPrevHour(year, month, day, hour) - - str_month = '0' + str(month) if month < 10 else str(month) - str_day = '0' + str(day) if day < 10 else str(day) - str_hour = '0' + str(hour) if hour < 10 else str(hour) - - prev_month = prev_hr[1] - prev_day = prev_hr[2] - prev_hour = prev_hr[3] - str_prev_month = '0' + str(prev_month) if prev_month < 10 else str(prev_month) - str_prev_day = '0' + str(prev_day) if prev_day < 10 else str(prev_day) - str_prev_hour = '0' + str(prev_hour) if prev_hour < 10 else str(prev_hour) - - log_end = str(year) + str_month + str_day + str_hour + '5500' - log_start = str(prev_hr[0]) + str_prev_month + str_prev_day + str_prev_hour + '5500' - - #log_start = str(year) + str(month) + str(day) + str(hour) + '5500' - #log_end = str(prev_hr[0]) + str(prev_hr[1]) + str(prev_hr[2]) + str(prev_hr[3]) + '5500' - - return [log_start, log_end] - - -""" Compute the difference among two timestamps """ -def get_timestamps_diff(timestamp_start, timestamp_end): - - year_1 = int(timestamp_start[0:4]) - month_1 = int(timestamp_start[4:6]) - day_1 = int(timestamp_start[6:8]) - hr_1 = int(timestamp_start[8:10]) - min_1 = int(timestamp_start[10:12]) - - year_2 = int(timestamp_end[0:4]) - month_2 = int(timestamp_end[4:6]) - day_2 = int(timestamp_end[6:8]) - hr_2 = int(timestamp_end[8:10]) - min_2 = int(timestamp_end[10:12]) - - t1 = cal.datetime.datetime(year=year_1, month=month_1, day=day_1, hour=hr_1, minute=min_1,second=0) - t2 = cal.datetime.datetime(year=year_2, month=month_2, day=day_2, hour=hr_2, minute=min_2,second=0) - - diff = t2 - t1 - diff = float(diff.seconds) / 3600 - - return diff - -""" Converts a list to a dictionary or vice versa -- INCOMPLETE MAY BE USEFUL AT SOME FUTURE POINT """ -def convert_list_dict(collection): - - if type(collection) is dict: - new_collection = list() - - elif type(collection) is list: - new_collection = dict() - - else: - print "miner_help::convert_list_dict: Invalid type, must be a list or a dictionary." - return 0; - - return new_collection - -""" Given an IP localizes the country """ -def localize_IP(cur, ip_string): - - # compute ip number - ip_fields = ip_string.split('.') - w = int(ip_fields[0]) - x = int(ip_fields[1]) - y = int(ip_fields[2]) - z = int(ip_fields[3]) - - ip_num = 16777216 * w + 65536 * x + 256 * y + z; - - sql_stmnt = 'select country_ISO_1 from ip_country where ' + str(ip_num) + ' >= ip_from and ' + str(ip_num) + ' <= ip_to' - - try: - cur.execute(sql_stmnt) - row = cur.fetchone() - except: - db.rollback() - sys.exit("Could not execute: " + sql_stmnt) - - try: - country = row[0] - except: - country = '' - - return country - -""" Load data into the IP localization table to associate IPs with countries """ -def load_IP_localization_table(): - - """ Get db object / Create cursor """ - # db = MySQLdb.connect(host='127.0.0.1', user='rfaulk', db='faulkner', port=3307) - db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner') - cur = db.cursor() - - # Parse CSV file - ipReader = csv.reader(open('./csv/IpToCountry.csv', 'rb')) - insert_stmnt = 'INSERT INTO ip_country VALUES ' - # (ip_from,ip_to,registry,assigned,country_ISO_1,country_ISO_2,country_name) - header = 1 - for row in ipReader: - # skip the csv comments - if row[0][0] != '#': - header = 0 - - if not(header): - - for i in range(len(row)): - pieces = row[i].split('\'') - - if len(pieces) > 1: - new_str = pieces[0] - - # remove single quotes from fields - for j in range(1,len(pieces)): - new_str = new_str + ' ' + pieces[j] - - row[i] = new_str - - vals = '\',\''.join(row) - sql_stmnt = insert_stmnt + '(\'' + vals + '\')' - - print vals - #cur.execute(sql_stmnt) - try: - cur.execute(sql_stmnt) - except: - db.rollback() - sys.exit("Could not insert: " + sql_stmnt) - - - # Commit to the db - db.commit() - - # Close connection - cur.close() - db.close() Deleted: trunk/fundraiser-statistics/fundraiser-scripts/settings.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/settings.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/settings.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,12 +0,0 @@ -__home__ = '/home/rfaulkner/trunk/projects/' -__project_home__ = '/home/rfaulkner/trunk/projects/Fundraiser_Tools/' -__web_home__ = '/home/rfaulkner/trunk/projects/Fundraiser_Tools/web_reporting/' -__squid_log_server__ = 'hume.wikimedia.org' -__squid_log_home__ = '/a/static/uncompressed/udplogs/' -__squid_log_local_home__ = __project_home__ + 'squid_logs/' - -__user__ = 'rfaulk' -__db__ = 'faulkner' -__db_server__ = '127.0.0.1' -__db_port__ = 3307 -__pass__='baggin5' \ No newline at end of file Deleted: trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py =================================================================== --- trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py 2011-06-28 22:06:17 UTC (rev 91024) +++ trunk/fundraiser-statistics/fundraiser-scripts/squid_miner_script.py 2011-06-28 22:09:09 UTC (rev 91025) @@ -1,79 +0,0 @@ - - -""" - -squid_miner_script.py - -wikimediafoundation.org -Ryan Faulkner -November 4th, 2010 - -""" - -""" - -squid_miner_script.py: - -Script to mine landing page data from squid logs - -""" - - -import sys -import MySQLdb - -import miner_help as mh -import mine_landing_pages as mlp -import mine_impression_request as mir - - - -""" -Process command line args - -1 - i for impressions, l for landingpages -2 - log file name - -""" - -try: - mine_option = sys.argv[1] - logFileName = sys.argv[2] -except IndexError: - sys.exit('Please enter a mining option and log file path and name\n i for impressions\n l for landing pages\n' \ - + ' e.g. $squid_miner_script.py i bannerImpressions.log\n') - -if (mine_option != 'i' and mine_option != 'l') or len(mine_option) > 1 : - sys.exit('Please select a valid mining option\n i for impressions\n l for landing pages\n' \ - + ' e.g. $squid_miner_script.py i bannerImpressions.log\n') - - - -""" - -INITIALIZE DB ACCESS - -""" - -""" Establish connection """ -db = MySQLdb.connect(host='localhost', user='root', passwd='baggin5', db='faulkner') -# db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner') -#db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner') - -""" Create cursor """ -cur = db.cursor() - - - -run_id = 0 -if mine_option == 'l': - mlp.mine_landing_pages(run_id, logFileName, db, cur) -else: - mir.mine_impression_requests(run_id, logFileName, db, cur) - -# Commit to the db -db.commit() - -# Close connection -cur.close() -db.close() _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs