Here is my version; it has not been tested much, and it probably doesn't work well for tables with only a few rows. It finds the character positions at which words most frequently begin, treats those positions as column boundaries, and then splits every line at them.
# Split a whitespace-aligned text table into columns by finding the character
# positions at which words most frequently begin, then slicing every line at
# those positions.  Runs unchanged on Python 2 and Python 3.
import pprint
import re

# NOTE(review): the column alignment of this table was lost when the post was
# flattened; it is reconstructed here from the word-start positions quoted
# below (0, 11, 25, 35, 49) and from the expected output -- confirm against
# the original message if exact spacing matters.
data = """\
44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope
"""

# RE to find the beginning of words: a word boundary followed by a non-space.
tpatt = re.compile(r"\b[^ ]")

# Remove empty lines.  A real list is required because the lines are walked
# twice (once to collect positions, once to slice); a lazy filter() object
# would already be exhausted on the second pass under Python 3.
lines = [li for li in data.splitlines() if li]

# Find the positions of all word beginnings ("treshs" = thresholds).
# For the first row this finds: treshs = [0, 11, 25, 35, 49, ...
#   44544      ipod          apple     black         102
#   ^          ^             ^         ^             ^
treshs = [ob.start() for li in lines for ob in tpatt.finditer(li)]

# Count how often each position occurs.  A plain dict keeps this working on
# very old interpreters; collections.defaultdict(int) would do the same job.
freqs = {}
for el in treshs:
    freqs[el] = freqs.get(el, 0) + 1

# A position counts as a column start when it is used by more than 60% of
# the count of the most frequent position.  Guard the empty case so that
# input without any words yields no columns instead of a ValueError.
bigf = max(freqs.values()) * 0.6 if freqs else 0

# The most common word beginnings, in left-to-right order.
cols = sorted(k for k, v in freqs.items() if v > bigf)


def xpairs(alist):
    """Yield each overlapping two-item slice of a sliceable sequence.

    xpairs([0, 1, 2, 3]) ==> [0, 1], [1, 2], [2, 3]

    Sequences with fewer than two items yield nothing.
    """
    for i in range(len(alist) - 1):
        yield alist[i:i + 2]


# Slice every line at the column starts; the trailing None makes the last
# slice run to the end of the line.
result = [[li[x:y].strip() for x, y in xpairs(cols + [None])] for li in lines]

print(data)
pprint.pprint(result)

"""
Output:

44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope

[['44544', 'ipod', 'apple', 'black', '102'],
 ['GFGFHHF-12', 'unknown thing', 'bizar', 'brick mortar', 'tbc'],
 ['45fjk', 'do not know', '+ is less', '', 'biac'],
 ['', 'disk', 'seagate', '250GB', '130'],
 ['5G_gff', '', 'tbd', 'tbd', ''],
 ['gjgh88hgg', 'media record', 'a and b', '', '12'],
 ['hjj', 'foo', 'bar', 'hop', 'zip'],
 ['hg uy oi', 'hj uuu ii a', 'qqq ccc v', 'ZZZ Ughj', ''],
 ['qdsd', 'zert', '', 'nope', 'nope']]
"""

# Bye,
# bearophile
# --
http://mail.python.org/mailman/listinfo/python-list