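Here is my version; it has not been tested much. It finds the most
frequent word-start positions across all rows and then splits every row
at those positions, so it probably doesn't work well for tables with few
rows (there is too little data for the frequencies to be meaningful).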

# Sample input: a fixed-width text table whose columns are aligned with
# spaces.  Cells may themselves contain single spaces, and some are blank.
# The backslash right after the opening quotes keeps the string from
# starting with an empty line.
data = """\
44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope
"""

import re, pprint
# import collections # For Python 2.5

# RE to find the beginning of words: a word boundary followed by a
# non-space character.
# NOTE: \b also matches at the end of a word that is directly followed by
# punctuation (e.g. at the "-" in "GFGFHHF-12").  In the sample data such
# stray positions are rare and are removed by the frequency filter below.
tpatt = re.compile(r"\b[^ ]")

# Remove empty lines.  This must be a real list rather than a lazy
# filter() object, because `lines` is iterated more than once and a
# Python 3 filter object would be exhausted after the first pass.
lines = [li for li in data.splitlines() if li]

# Find the positions of all word beginnings
# This finds:  treshs = [0, 11, 25, 35, 49, ...
# 44544      ipod          apple     black         102
# ^          ^             ^         ^             ^
treshs = [ob.start() for li in lines for ob in tpatt.finditer(li)]

# Count how many times each start position occurs over all lines
freqs = {}
for el in treshs:
    freqs[el] = freqs.get(el, 0) + 1

# Counting the positions, alternative for Python V.2.5
# freqs = collections.defaultdict(int)
# for el in treshs:
#     freqs[el] += 1

# Frequency threshold: a position counts as a column start only if it is
# seen more than 60% as often as the most common position.  With no words
# at all there is nothing to count, so fall back to 0 instead of letting
# max() raise on an empty sequence.
bigf = max(freqs.values()) * 0.6 if freqs else 0

# Find the most common column beginnings, in left-to-right order
# (values()/items() are used because they exist on Python 2 and 3 alike).
cols = sorted(k for k, v in freqs.items() if v > bigf)

def xpairs(alist):
    """Yield every pair of adjacent items of *alist* as a two-item slice.

    xpairs([0, 1, 2, 3]) ==> [0, 1], [1, 2], [2, 3]

    *alist* must support len() and slicing; each yielded pair is a slice
    of it (so a list yields two-item lists).  A sequence with fewer than
    two items yields nothing.
    """
    # range() instead of xrange() so the function runs on Python 2 and 3.
    for start in range(len(alist) - 1):
        yield alist[start:start + 2]

# Cut every line at the detected column starts.  Appending None to `cols`
# makes the last pair (last_start, None), so the final slice runs to the
# end of the line; strip() removes the padding around each cell.
result = [
    [li[x:y].strip() for x, y in xpairs(cols + [None])]
    for li in lines
]

# print(...) with a single argument gives the same output on Python 2
# and Python 3, unlike the Python 2-only print statement.
print(data)
pprint.pprint(result)


"""
Output:

44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope

[['44544', 'ipod', 'apple', 'black', '102'],
 ['GFGFHHF-12', 'unknown thing', 'bizar', 'brick mortar', 'tbc'],
 ['45fjk', 'do not know', '+ is less', '', 'biac'],
 ['', 'disk', 'seagate', '250GB', '130'],
 ['5G_gff', '', 'tbd', 'tbd', ''],
 ['gjgh88hgg', 'media record', 'a and b', '', '12'],
 ['hjj', 'foo', 'bar', 'hop', 'zip'],
 ['hg uy oi', 'hj uuu ii a', 'qqq ccc v', 'ZZZ Ughj', ''],
 ['qdsd', 'zert', '', 'nope', 'nope']]
"""

Bye,
bearophile

-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to