Hello I'm developing a script to compare two files, finding duplicate entries and matching which id of one csv file corresponds to the id of another csv file. The first version was working nice, but I wanted to postpone the writing to a file till the end and also make a correct csv file. The code is not so great and I expect to work with no more than 3000 lines of data in either file: So here is the inicial code. I hope it's not too long or complicated: import csv import re import addrnormalize import difflib import time
started = time.time() nobv = open('regnobv.csv', 'wb') yesbv = open('reginbv.csv', 'wb') bv = open(r'\\albertapdc\ESP Data\ESP Main Files\BV_Customersa.csv').read().upper() site = open(r'C:\myscripts\latestregistrants.csv').read().upper() site = re.sub(r'([0-9]{3})-([0-9]{3})-([0-9]{4})', r'\1\2\3', site) bvreader = csv.DictReader(bv.splitlines()) sitelist = csv.DictReader(site.splitlines()) def inbv(yesbv): yesbv.write(item['USER_ID'] + ',') yesbv.write(row['CUS_NO'] + ',') yesbv.write(item['COMPANY'] + ',') yesbv.write(row['BVADDR1'] + ',') yesbv.write(item['ADDRESSLINEONE']+ ',') yesbv.write(row['BVADDRTELNO1'] + ',') yesbv.write(item['PHONE'] + '\n') bvreader = list(bvreader) # or (row['NAME'] in item['COMPANY']) or (row['BVADDREMAIL'] in item['EMAIL']) for item in sitelist: for row in bvreader: if ((row['BVADDRTELNO1'] == item['PHONE'] and row['BVADDRTELNO1']) or (row['BVADDREMAIL'] == item['EMAIL'] and row['BVADDREMAIL'])): inbv(yesbv) break ## this module just makes a few string transformations to standardize both strings. Like STREET -> ST elif addrnormalize.format_address(row['BVADDR1']) == addrnormalize.format_address(item['ADDRESSLINEONE']) and row['BVADDR1'] and row['BVPROVSTATE'] == item['STATE'] and row['BVPROVSTATE']: inbv(yesbv) break ## trying some fuzzy matching here elif (difflib.SequenceMatcher(lambda x: x in " ,.-#" , row['BVADDR1'], item['ADDRESSLINEONE']).quick_ratio() > 0.87) \ and (difflib.SequenceMatcher(lambda x: x in " .-" , row['BVCITY'], item['CITY']).quick_ratio() > 0.87): inbv(yesbv) break else: nobv.write(item['USER_ID']+ ',') nobv.write(item['FIRSTNAME']+ ',') nobv.write(item['LASTNAME']+ ',') nobv.write(item['COMPANY']+ ',') nobv.write(item['EMAIL'].lower()+ ',') nobv.write(item['PHONE']+ ',') nobv.write(item['FAX']+ ',') nobv.write(item['ADDRESSLINEONE']+ ',') nobv.write(item['ADDRESSLINETWO']+ ',') nobv.write(item['CITY']+ ',') nobv.write(item['STATE']+ ',') nobv.write(item['POSTALCODE']+ ',') nobv.write(item['COUNTRY']+ ',') nobv.write('\n') nobv.close() yesbv.close() finished = time.time() print finished - started ---- End of code --- #### When I try with list it does not even print the "print linha" test #### If I uncomment all the conditionals except the first if than I get that written to the final file: reginbv. ### How is the new function with list affecting the results? import csv import re import addrnormalize import difflib import time started = time.time() nobv = open('regnobv.csv', 'wb') bv = open(r'\\albertapdc\ESP Data\ESP Main Files\BV_Customersa.csv').read().upper() site = open(r'C:\myscripts\latestregistrants.csv').read().upper() site = re.sub(r'([0-9]{3})-([0-9]{3})-([0-9]{4})', r'\1\2\3', site) bvreader = csv.DictReader(bv.splitlines()) sitelist = csv.DictReader(site.splitlines()) list2csv = [] list_not_in_bv = [] yesbv = csv.writer(open('reginbv.csv', 'wb'), dialect="excel") nobv = csv.writer(open('regnobv.csv', 'wb'), dialect="excel") def inbv(currentline = None): """writes a line of data when a date is found in BV""" if currentline is None: currentline = [] else: currentline.append(item['USER_ID']) currentline.append(row['CUS_NO']) currentline.append(item['COMPANY']) currentline.append(row['BVADDR1']) currentline.append(item['ADDRESSLINEONE']) currentline.append(row['BVADDRTELNO1']) currentline.append(item['PHONE']) currentline.append(row['BVCITY']) currentline.append(item['CITY']) return currentline def notinbv(currentline): if currentline is None: currentline = [] else: currentline.append(item['USER_ID']) currentline.append(item['FIRSTNAME']) currentline.append(item['LASTNAME']) currentline.append(item['COMPANY']) currentline.append(item['EMAIL']) currentline.append(item['PHONE']) currentline.append(item['FAX']) currentline.append(item['ADDRESSLINEONE']) currentline.append(item['ADDRESSLINETWO']) currentline.append(item['CITY']) currentline.append(item['STATE']) currentline.append(item['POSTALCODE']) currentline.append(item['COUNTRY']) return currentline bvreader = list(bvreader) # or (row['NAME'] in item['COMPANY']) or (row['BVADDREMAIL'] in item['EMAIL']) for item in sitelist: for row in bvreader: if ((row['BVADDRTELNO1'] == item['PHONE'] and row['BVADDRTELNO1']) or (row['BVADDREMAIL'] == item['EMAIL'] and row['BVADDREMAIL'])): lin = [] linha = inbv(lin) list2csv.append(linha) print linha break elif addrnormalize.format_address(row['BVADDR1']) == addrnormalize.format_address(item['ADDRESSLINEONE']) and row['BVADDR1'] and row['BVPROVSTATE'] == item['STATE'] and row['BVPROVSTATE']: lin = [] linha = inbv(lin) list2csv.append(linha) break ## elif (difflib.SequenceMatcher(lambda x: x in " ,.-#" , row['BVADDR1'], item['ADDRESSLINEONE']).quick_ratio() > 0.87) \ and (difflib.SequenceMatcher(lambda x: x in " .-" , row['BVCITY'], item['CITY']).quick_ratio() > 0.87): lin = [] linha = inbv(lin) list2csv.append(linha) break ## ## else: le = [] linha = notinbv(le) list_not_in_bv.append(linha) break print "now printing list2csv" print list2csv print list_not_in_bv for customer in list2csv: yesbv.writerow(customer) for customer in list_not_in_bv: nobv.writerow(customer) finished = time.time() print finished - started _______________________________________________ Tutor maillist - Tutor@python.org To unsubscribe or change subscription options: http://mail.python.org/mailman/listinfo/tutor