> Is there a reason to prefer one over the other? Is one faster?
> I compiled my regular expression to make it quicker.
With Python 2.4 I get these results with timeit.py (all imports are factored out; all methods give the same result except for CSV, which strips the double quotes):

Own split (printed as "No quotes" by the script below): 26.8668364275
Tokenize: 78.8295112926
Rejoin: 11.237671827
Re: 13.9386123097
Re compiled: 8.19355839918
CSV: 23.3710904598

Of course, speed isn't everything (or you wouldn't be using Python). Readability is probably the most important factor - I'd say that a re (particularly a verbose re) would be the most readable, followed by using the CSV module, followed by writing your own split function. Since re is also the fastest of the methods suggested so far, it seems like a good choice.

> What a rich language! So many choices.

Somewhat ironically, one of the tenets of Python is "there should be one-- and preferably only one --obvious way to do it." (Type "import this" at an interactive prompt.)

=Tony.Meyer
# Benchmark of several ways to split a string on spaces while keeping
# double-quoted phrases together, e.g.
#     'Hi "Python Tutors" please help'
#         -> ['Hi', '"Python Tutors"', 'please', 'help']
#
# Run the file as a script to time every approach with timeit.

import csv
import re
import tokenize

try:  # Python 2
    from StringIO import StringIO
except ImportError:  # Python 3
    from io import StringIO


# A token is either a double-quoted phrase or a run of non-space characters.
# The quoted alternative uses [^"]* rather than a greedy .* so that two quoted
# phrases in one string are not merged into a single token.
TOKEN_PATTERN = r'"[^"]*"|[^ ]+'

# Pre-compiled form of TOKEN_PATTERN (kept under its original name ``r``).
r = re.compile(TOKEN_PATTERN)


def split_with_csv(s):
    """Split *s* with the csv module, using a space as the delimiter.

    Unlike the other splitters, the csv reader strips the surrounding double
    quotes from quoted phrases.  Returns an empty list for empty input.
    """
    reader = csv.reader(StringIO(s), delimiter=' ')
    # The reader yields no rows at all for empty input; default to [] rather
    # than letting StopIteration escape.
    return next(reader, [])


def split_with_re_compiled(s):
    """Split *s* using the pre-compiled module-level pattern ``r``."""
    return r.findall(s)


def split_with_re(s):
    """Split *s* using ``re.findall`` with the (uncompiled) TOKEN_PATTERN."""
    return re.findall(TOKEN_PATTERN, s)


def split_with_tokenize(s):
    """Split *s* by running it through Python's own tokenizer.

    The text is lexed as Python source, so this only makes sense for input
    that the tokenizer accepts.  Returns the string of every token except the
    structural end-of-input / end-of-line markers.
    """
    # Filter by token type instead of slicing off the last element: depending
    # on the Python version the tokenizer may emit a NEWLINE token before
    # ENDMARKER, which a plain [:-1] would leave behind as ''.
    skipped = (tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL)
    return [
        token[1]
        for token in tokenize.generate_tokens(StringIO(s).readline)
        if token[0] not in skipped
    ]


def split_and_rejoin(s):
    """Split *s* on whitespace, then glue quoted phrases back together.

    Words between an opening and a closing double quote are re-joined with a
    single space (so runs of whitespace inside quotes are collapsed).  If a
    quote is never closed, the words after it are returned individually
    instead of being dropped.
    """
    result = []
    pending = []  # words collected since the currently open quote
    in_quotes = False
    for word in s.split():
        # Only an odd number of quote characters changes the quoting state;
        # a self-contained word such as '"hello"' opens and closes at once.
        if word.count('"') % 2:
            in_quotes = not in_quotes
            if not in_quotes:
                # This word closes the phrase: emit the re-joined phrase.
                pending.append(word)
                result.append(" ".join(pending))
                pending = []
                continue
        if in_quotes:
            pending.append(word)
        else:
            result.append(word)
    # Unterminated quote: keep the collected words rather than losing them.
    result.extend(pending)
    return result


def split_no_quotes(s):
    """Split *s* on single spaces that are not inside double quotes.

    Scans the string one character at a time.  Consecutive unquoted spaces
    produce empty strings in the result; a trailing unquoted space does not
    produce a trailing empty string.  Returns an empty list for empty input.
    """
    result = []
    start = 0
    in_quotes = False
    for index, char in enumerate(s):
        if char == '"':
            in_quotes = not in_quotes
        elif char == ' ' and not in_quotes:
            result.append(s[start:index])
            start = index + 1
    # Emit whatever follows the last split point.  Nothing is left when the
    # string is empty or ends with an unquoted space.
    if start < len(s):
        result.append(s[start:])
    return result


if __name__ == "__main__":
    import timeit

    samples = (
        'Hi "Python Tutors" please help',
        'This will not work as "more than two words" are quoted',
    )
    benchmarks = (
        ("No quotes", "split_no_quotes"),
        ("Tokenize", "split_with_tokenize"),
        ("Rejoin", "split_and_rejoin"),
        ("Re", "split_with_re"),
        ("Re compiled", "split_with_re_compiled"),
        ("CSV", "split_with_csv"),
    )
    for sample in samples:
        for label, func_name in benchmarks:
            # Import from __main__ so the benchmark does not depend on the
            # file being saved under a particular module name.
            timer = timeit.Timer(
                "%s(%r)" % (func_name, sample),
                "from __main__ import %s" % func_name,
            )
            print("%s %s" % (label, timer.timeit()))
_______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor