Marc, thank you for the example it made me realize where I was getting things wrong. I didn't realize how specific I needed to be. Also http://weitz.de/regex-coach/ really helped me test things out on this one. I realized I had some more exceptions like C18H34O2.1/2Cu and I also realized I didn't really understand regular expressions (which I still don't but I think it's getting better)
FORMULA = re.compile(r'([A-Z][A-Za-z0-9]+\.?[A-Za-z0-9]+/?[A-Za- z0-9]+)') This gets all Chemical names like C14H28 C18H34O2.1/2Cu C8H17ClO2, ie a word that begins with a capital letter followed by any number of upper or lower case letters and numbers followed by a possible . followed by any number of upper or lower case letters and numbers followed by a possible / followed by any number of upper or lower case letters and numbers. Say that five times fast! So now I want to tell the program that if it finds the formula at the end then continue, otherwise if it finds C.I. 75240 or any other type of word that it should not be broken by a | and be lumped into the whole line. But now I get: Traceback (most recent call last): File "C:\Python24\Lib\site-packages\pythonwin\pywin\framework \scriptutils.py", line 310, in RunScript exec codeObject in __main__.__dict__ File "C:\Documents and Settings\Patrick Waldo\My Documents\Python \WORD\try5-2-file-1-1.py", line 32, in ? input = codecs.open(input_text, 'r','utf8') File "C:\Python24\lib\codecs.py", line 666, in open file = __builtin__.open(filename, mode, buffering) IOError: [Errno 13] Permission denied: 'C:\\Documents and Settings\ \Patrick Waldo\\Desktop\\decernis\\DAD\\EINECS_SK\\text\\output' Ideas? #For text files in a directory... #Analyzes a randomly organized UTF8 document with EINECS, CAS, Chemical, and Chemical Formula #into a document structured as EINECS|CAS|Chemical|Chemical Formula. import os import codecs import re path = "C:\\text" path2 = "C:\\text\output" EINECS = re.compile(r'^\d\d\d-\d\d\d-\d $') FORMULA = re.compile(r'([A-Z][A-Za-z0-9]+\.?[A-Za-z0-9]+/?[A-Za- z0-9]+)') def iter_elements(tokens): product = [] for tok in tokens: if EINECS.match(tok) and len(product) >= 4: if product[-1] == FORMULA.findall(tok): product[2:-1] = [' '.join(product[2:-1])] yield product product = [] else: product[2:-1] = [' '.join(product[2:])] yield product product = [] product.append(tok) yield product for text in os.listdir(path): input_text = os.path.join(path,text) output_text = os.path.join(path2,text) input = codecs.open(input_text, 'r','utf8') output = codecs.open(output_text, 'w', 'utf8') tokens = input.read().split() for element in iter_elements(tokens): output.write('|'.join(element)) output.write("\r\n") input.close() output.close() -- http://mail.python.org/mailman/listinfo/python-list