I am trying to extract all strings enclosed in nested parentheses (along with the parentheses themselves) from my .txt file. Please see the sample .txt file that I have used in this example: (https://drive.google.com/open?id=1UKc0ZgY9Fsz5O1rSeBCLqt5dwZkMaQgr).
I have written three different pieces of code, but none of them is able to extract all the nested parentheses; each one extracts only a portion of them. Any advice on what I've done wrong would really help! Here are the three attempts I have made so far: 1st attempt: import re from os.path import join def balanced_braces(args): parts = [] for arg in args: if '(' not in arg: continue chars = [] n = 0 for c in arg: if c == '(': if n > 0: chars.append(c) n += 1 elif c == ')': n -= 1 if n > 0: chars.append(c) elif n == 0: parts.append(''.join(chars).lstrip().rstrip()) chars = [] elif n > 0: chars.append(c) return parts with open('lan sample text file.txt','r') as fd: #for words in fd.readlines(): t1 = balanced_braces(fd); print(t1) Output: ['"xE\'", PUT(xx.xxxx.),"\'"', '"TRUuuuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.", '"xE\'", PUT(xx.xxxx.),"\'"', '"CUuuiiiiuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."] 2nd attempt: from pyparsing import nestedExpr matchedParens = nestedExpr('(',')') with open('lan sample text file.txt','r') as fd: for words in fd.readlines(): for e in matchedParens.searchString(words): print(e) Output: [['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']] [['"TRUuuuth"']] [['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']] [['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']] [['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']] [['"CUuuiiiiuth"']] [['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']] [['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']] 3rd attempt: def parse_segments(source, recurse=False): unmatched_count = 0 start_pos = 0 opened = False open_pos = 0 cur_pos = 0 finished = [] segments = [] for character in source: #scan for mismatched parenthesis: if character == '(': unmatched_count += 1 if not opened: open_pos = cur_pos opened = True if character == ')': unmatched_count -= 1 if opened and unmatched_count == 
0: segment = source[open_pos:cur_pos+1] segments.append(segment) clean = source[start_pos:open_pos] if clean: finished.append(clean) opened = False start_pos = cur_pos+1 cur_pos += 1 # assert unmatched_count == 0 if start_pos != cur_pos: #get anything that was left over here finished.append(source[start_pos:cur_pos]) #now check on recursion: for item in segments: #get rid of bounding parentheses: pruned = item[1:-1] if recurse: results = parse_tags(pruned, recurse) finished.expand(results) else: finished.append(pruned) return finished with open('lan sample text file.txt','r') as fd: for words in fd.readlines(): t = parse_segments(words) print(t) Output: ['kkkkk;\n'] ['\n'] [' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"'] ['quit; \n'] ['\n'] ['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n'] ['proc sql; ', ';\n', '"TRUuuuth"'] ['hhhjhfjs as fdsjfsj:\n'] ['select * from djfkjd to jfkjs\n'] ['(\n'] ['SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n'] ['\tFROM &xxx..xxx_xxx_xxE\n'] ["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"] [' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."] [' );\n'] ['\n'] ['\n'] ['jjjjjj;\n'] ['\n'] [' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"'] ['quit; \n'] ['\n'] ['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n'] ['proc sql; ', ';\n', '"CUuuiiiiuth"'] ['hhhjhfjs as fdsjfsj:\n'] ['select * from djfkjd to jfkjs\n'] ['(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n'] ['\tFROM &xxx..xxx_xxx_xxE\n'] ["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) 
and \n"] [' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."] [' );'] The intended output, which I have been unable to get, should look something like this: ("xE'", PUT(xx.xxxx.),"'") ("TRUuuuth") ( SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj FROM &xxx..xxx_xxx_xxE where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.)) ) ("xE'", PUT(xx.xxxx.),"'") ("CUuuiiiiuth") (SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj FROM &xxx..xxx_xxx_xxE where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( )) ) -- https://mail.python.org/mailman/listinfo/python-list