I posted the first part of this before as http://lists.canonical.org/pipermail/kragen-hacks/2002-January/000310.html --- this renders it useful. I wrote it tonight, starting from that post.
Obviously it could be faster, prettier, and more flexible, and it contains a routine called "fwiffle", but it works under Python 1.5 and 2.1, and it's useful. My sample data file was an HTTP log. Here's the CGI script: #!/usr/bin/python import sys sys.path.insert(0, '/home/kragen/devel/oerlap') import oerlapcgi, string oerlapcgi.debug = 1 oerlapcgi.oerlapcgi('/tmp/lists-log', string.split('client user group date timezone method ' + 'URL version result nbytes referer'), datatitle="HTTP logs", defaulthide=string.split('user group timezone version')) This imports oerlapcgi.py, which follows. # TO DO: # add output size limits by default, plus ways of increasing them. # more hierarchical stuff: /~kragen, /~kragen/sw, etc. # add gzip content-transfer-encoding for clients that support it. It would # improve response time by about a factor of 20. # add an obvious way to remove restrictions, i.e. widen the filter. # add an obvious way to hide columns. # add a way to hide columns that aren't hidden by default. # make ... clickable to see the full extent of the hideousness # make the 'more' indicator clickable. # add pretty colors # add pretty fonts # add ability to sort by some columns # add totals to all-numeric or mostly-numeric columns import oerlap, cgi, string, sys, os, urllib debug = 0 def row(items, celltype='td'): rv = (['<tr>'] + map(lambda s, c=celltype: "<%s>%s</%s>" % (c, s, c), items) + ['</tr>\n']) return string.join(rv, '') def userval(somestring): somestring = str(somestring) if len(somestring) > 40: somestring = somestring[:40] + '...' return cgi.escape(somestring, 1) def andlist(alist): assert len(alist) > 0 if len(alist) == 1: return alist[0] elif len(alist) == 2: return "%s and %s" % tuple(alist) else: return "%s, and %s" % (string.join(alist[:-1], ", "), alist[-1]) def title(datatitle, colnames, paramdict): rv = cgi.escape(datatitle) brokenout = [] for name in colnames: if breakoutby(name, paramdict): brokenout.append(name) if len(brokenout) == 0: return rv else: return "%s by %s" % (rv, andlist(brokenout)) def describe_filters(colnames, paramdict): filters = [] for name in colnames: value = filterby(name, paramdict) if value: filters.append((name, value)) if filters == []: return "All input rows selected." else: return "Only input rows where %s are selected." % andlist( map(lambda (name, value): "%s is %s" % (name, repr(userval(value))), filters)) def describe_hides(colnames, paramdict, defaulthides): hidden = filter(lambda name, p=paramdict, d=defaulthides: hidden(name, p, d), colnames) if hidden == []: return "All columns displayed." else: return "The following columns are hidden: %s." % andlist( map(lambda name, p=paramdict: '<a href="%s">%s</a>' % (unhideurl(name, p), name), hidden)) def render_nrows(nrows): if nrows == 0: return "No rows" elif nrows == 1: return "One row" else: return "%d rows" % nrows def breakoutbyurl(colname, paramdict): paramdict = paramdict.copy() fieldname = 'bb_%s' % colname if '1' in paramdict.get(fieldname, []): del paramdict[fieldname] else: paramdict[fieldname] = ['1'] # I'm kind of dubious about this. It means that you can turn off # filtering by a particular field by clicking on the header for that # field, and you can hide a field (if it would be hidden by default) # by doing that twice. That seems kind of nonobvious and possibly # surprising. for param in ['show_' + colname, 'f_' + colname]: if paramdict.has_key(param): del paramdict[param] return urlencode(paramdict) def urlencode(paramdict): # We can't rely on urllib.urlencode to be sane in Python 1.5 --- it # doesn't accept a list of tuples, so doesn't handle multiple values. rv = [] for key, value in paramdict.items(): key = urllib.quote_plus(str(key)) for item in value: item = urllib.quote_plus(str(item)) rv.append(key + '=' + item) return cgiurl() + '?' + string.join(rv, '&') def breakoutby(colname, paramdict): return '1' in paramdict.get('bb_%s' % colname, []) def filterurl(fields, fieldnames, values, paramdict): paramdict = paramdict.copy() for ii in range(len(fields)): fieldname = fieldnames[fields[ii]] paramdict['f_%s' % fieldname] = [values[ii]] for param in ['bb_' + fieldname, 'show_' + fieldname]: if paramdict.has_key(param): del paramdict[param] return urlencode(paramdict) def filterby(colname, paramdict): return paramdict.get('f_%s' % colname, [None])[0] def unhideurl(colname, paramdict): paramdict = paramdict.copy() paramdict['show_' + colname] = [1] return urlencode(paramdict) def hidden(colname, paramdict, defaulthides): # hiding things you're filtering or breaking out by is confusing. return (colname in defaulthides and '1' not in paramdict.get("show_%s" % colname, []) and not filterby(colname, paramdict) and not breakoutby(colname, paramdict)) def cgiurl(): "Return a URL likely to refer to this script, without parameters." # logic copied from CGI.pm 2.46, minus some features and bugs getenv = os.environ.get port = int(getenv('SERVER_PORT', '80')) # default is for debugging if (getenv('HTTPS') == 'ON' or port == 443): protocol = "https" else: protocol = "http" if (protocol == 'http' and port == 80 or protocol == 'https' and port == 443): portstr = "" else: portstr = ":" + str(port) hostname = getenv('HTTP_HOST', getenv('SERVER_NAME', 'localhost')) path = getenv('SCRIPT_NAME', sys.argv[0]) return protocol + "://" + hostname + portstr + path def fwiffle(filename, fieldtitles, output, paramdict, datatitle="data", defaulthide=[]): mytitle = title(datatitle, fieldtitles, paramdict) output(('Content-Type: text/html\n\n<html><head><title>%s</title>\n' + '</head><body><h1>%s</h1>\n') % (mytitle, mytitle)) if debug: output('%s' % paramdict) try: bocols = [] filters = [] # input row selection criteria hiddencols = [] for ii in range(len(fieldtitles)): colname = fieldtitles[ii] if breakoutby(colname, paramdict): bocols.append(ii) if filterby(colname, paramdict) is not None: filters.append((ii, filterby(colname, paramdict))) if hidden(colname, paramdict, defaulthide): hiddencols.append(ii) datasrc = oerlap.filterdata(oerlap.filelines(open(filename)), filters) results, freqs, n = oerlap.oerlap(datasrc, bocols) output('<p>%s %s</p>' % (describe_filters(fieldtitles, paramdict), describe_hides(fieldtitles, paramdict, defaulthide))) output('<p>%s selected from input. %s in this summary.</p>\n' % (render_nrows(n), render_nrows(len(results)))) output('<table border>\n') headers = map(lambda ss, paramdict=paramdict, fieldtitles=fieldtitles: '<a href="%s">%s</a>' % (breakoutbyurl(ss, paramdict), cgi.escape(ss)), filter(lambda name, p=paramdict, d=defaulthide: not hidden(name, p, d), fieldtitles)) output(row(['N'] + headers, 'th')) for eachkey in oerlap.sort(freqs): if (len(results)) == 1: zoom = '' else: zoom = ('<a href="%s">(zoom)</a>' % filterurl(bocols, fieldtitles, eachkey, paramdict)) cells = ['<p align="center">%s<br />%s</p>' % (str(freqs[eachkey]), zoom)] for ii in range(len(results[eachkey])): if ii in hiddencols: continue frequencies = results[eachkey][ii] mystr = ['<table width="100%">'] maxn = 3 for eachitem in oerlap.sort(frequencies)[:maxn]: mystr.append(('<tr><td>%s</td>' + '<td align="right">%s</td></tr>\n') % (userval(eachitem), frequencies[eachitem])) if len(frequencies) > maxn: mystr.append('<tr><td colspan="2" align="center">' + '(%d more)</td></tr>\n' % (len(frequencies) - maxn)) cells.append(string.join(mystr + ['</table>'], '')) output(row(cells, 'td valign="top"')) finally: output('</table></body></html>\n') def oerlapcgi(filename, fieldtitles, datatitle="data", defaulthide=[]): "Provides a cgi veneer over fwiffle." import os if os.environ.has_key("GATEWAY_INTERFACE"): try: import cgitb sys.excepthook = cgitb.excepthook except ImportError: # guess they don't have cgitb (http://web.lfw.org/python/). # Oh well. Losers. Guess they have lots of time to debug stuff. pass fwiffle(filename, fieldtitles, sys.stdout.write, cgi.parse(), datatitle=datatitle, defaulthide=defaulthide) That imports oerlap.py, which follows: # incredibly powerful secret web log analysis tool import string def oerlap(datasrc, breakoutby): """Analyze data. Given a data source that yields tuples or None when .next() is called, and a sequence 'breakoutby' that specifies which fields of the tuples to break out by, count frequencies. Result is a dict; keys are tuples of values things are broken out by; values are lists of dicts mapping keys to frequencies. """ results = {} freqs = {} nn = 0 while 1: line = datasrc.next() if line is None: return results, freqs, nn nn = nn + 1 key = tuple(map(lambda f, line=line: line[f], breakoutby)) if not results.has_key(key): results[key] = map(lambda x: {}, range(len(line))) r = results[key] freqs[key] = freqs.get(key, 0) + 1 if len(r) < len(line): r.extend([{}] * (len(line) - len(r))) for dict, value in map(None, r, line): dict[value] = dict.get(value, 0) + 1 def sort(freqs): """Returns keys of a hash results sorted descending by their values. Useful for the freqs result of oerlap or for the individual items within its results result. """ rv = map(lambda (key, value): (value, key), freqs.items()) rv.sort() rv.reverse() return map(lambda item: item[1], rv) class filterdata: "Return only data items matching a filter." def __init__(self, datasource, filter): self.datasource = datasource self.filter = filter def next(self): while 1: next = self.datasource.next() if next is None: return None for field, value in self.filter: if next[field] != value: break else: return next class filelines: "Return lines from a file." def __init__(self, somefile): self.file = somefile def next(self): line = self.file.readline() if line == "": return None return tuple(map(lambda x: intern(x), string.split(line))) class arrayitems: "For testing. Return tuples from an array." def __init__(self, somearray): self.array = somearray self.ii = 0 def next(self): if self.ii == len(self.array): return None try: return self.array[self.ii] finally: self.ii = self.ii + 1 testdata = [('a', 1, 32), ('a', 1, 33), ('b', 1, 31), ('c', 2, 30), ('a', 0, 30)] def test(bb=[]): return oerlap(arrayitems(testdata), bb) -- <[EMAIL PROTECTED]> Kragen Sitaker <http://www.pobox.com/~kragen/> The sages do not believe that making no mistakes is a blessing. They believe, rather, that the great virtue of man lies in his ability to correct his mistakes and continually make a new man of himself. -- Wang Yang-Ming