oerlap: quick analysis of multivariate data

Kragen Sitaker Mon, 18 Feb 2002 00:22:21 -0800

I posted the first part of this before as
http://lists.canonical.org/pipermail/kragen-hacks/2002-January/000310.html
--- this renders it useful.  I wrote it tonight, starting from that post.


Obviously it could be faster, prettier, and more flexible, and it
contains a routine called "fwiffle", but it works under Python 1.5 and
2.1, and it's useful.  My sample data file was an HTTP log.

Here's the CGI script:

#!/usr/bin/python
# CGI entry point: serve an oerlap summary of an HTTP access log.
import sys
# oerlapcgi is not on the default module path, so extend the path first.
sys.path.insert(0, '/home/kragen/devel/oerlap')
import string
import oerlapcgi

# Names for the fields of each log line, in order.
columns = string.split('client user group date timezone method ' +
                       'URL version result nbytes referer')
# Columns left out of the table unless the request asks for them.
hidden_columns = string.split('user group timezone version')

# Echo the parsed request parameters into the page.
oerlapcgi.debug = 1
oerlapcgi.oerlapcgi('/tmp/lists-log', columns,
                    datatitle="HTTP logs",
                    defaulthide=hidden_columns)


This imports oerlapcgi.py, which follows.

# TO DO:
# add output size limits by default, plus ways of increasing them.
# more hierarchical stuff: /~kragen, /~kragen/sw, etc.
# add gzip content-transfer-encoding for clients that support it.  It would
#     improve response time by about a factor of 20.
# add an obvious way to remove restrictions, i.e. widen the filter.
# add an obvious way to hide columns.
# add a way to hide columns that aren't hidden by default.
# make ... clickable to see the full extent of the hideousness
# make the 'more' indicator clickable.
# add pretty colors
# add pretty fonts
# add ability to sort by some columns
# add totals to all-numeric or mostly-numeric columns

import oerlap, cgi, string, sys, os, urllib

# When true, fwiffle() also writes the request parameter dict into the page.
# A calling script switches it on by assigning oerlapcgi.debug = 1.
debug = 0

def row(items, celltype='td'):
    rv = (['<tr>'] +
          map(lambda s, c=celltype: "<%s>%s</%s>" % (c, s, c), items) +
          ['</tr>\n'])
    return string.join(rv, '')

def userval(somestring):
    somestring = str(somestring)
    if len(somestring) > 40: somestring = somestring[:40] + '...'
    return cgi.escape(somestring, 1)

def andlist(alist):
    assert len(alist) > 0
    if len(alist) == 1:
        return alist[0]
    elif len(alist) == 2:
        return "%s and %s" % tuple(alist)
    else:
        return "%s, and %s" % (string.join(alist[:-1], ", "), alist[-1])

def title(datatitle, colnames, paramdict):
    """Return the page title: the data title plus any break-out columns.

    datatitle is HTML-escaped.  If the request breaks out by one or more
    of colnames, ' by <columns>' is appended, with the column names joined
    by andlist() in colnames order.
    """
    heading = cgi.escape(datatitle)
    selected = []
    for column in colnames:
        if breakoutby(column, paramdict):
            selected.append(column)
    if len(selected) != 0:
        return "%s by %s" % (heading, andlist(selected))
    return heading

def describe_filters(colnames, paramdict):
    """Return a sentence describing which input rows are selected.

    Every column of colnames that has a non-empty filter value in
    paramdict contributes a '<name> is <value>' clause; the value is
    passed through userval() and then repr().  With no such column the
    sentence says that all rows are selected.
    """
    clauses = []
    for name in colnames:
        value = filterby(name, paramdict)
        if not value:
            continue
        clauses.append("%s is %s" % (name, repr(userval(value))))
    if clauses == []:
        return "All input rows selected."
    return "Only input rows where %s are selected." % andlist(clauses)

def describe_hides(colnames, paramdict, defaulthides):
    """Return an HTML sentence listing the columns currently hidden.

    colnames     -- all column names, in display order.
    paramdict    -- request parameters (dict of name -> list of values).
    defaulthides -- names of the columns hidden unless asked for.

    Each hidden column is rendered as a link to unhideurl() for that
    column.  With nothing hidden the sentence is 'All columns displayed.'
    """
    # The list must not be named 'hidden': with nested scopes that name
    # would shadow the module-level hidden() function called below.
    hiddencols = []
    for name in colnames:
        if hidden(name, paramdict, defaulthides):
            hiddencols.append(name)
    if not hiddencols:
        return "All columns displayed."
    links = []
    for name in hiddencols:
        links.append('<a href="%s">%s</a>' % (unhideurl(name, paramdict),
                                              name))
    return "The following columns are hidden: %s." % andlist(links)

def render_nrows(nrows):
    """Return a row count as a phrase that can start a sentence.

    Zero and one are spelled out ('No rows', 'One row'); every other
    count is formatted as '<n> rows'.
    """
    if nrows == 0:
        return "No rows"
    if nrows == 1:
        return "One row"
    return "%d rows" % nrows

def breakoutbyurl(colname, paramdict):
    """Return a URL that toggles breaking out by colname.

    Works on a copy of paramdict.  The 'bb_<colname>' flag is removed if
    it is currently '1' and set to '1' otherwise; any 'show_<colname>' and
    'f_<colname>' parameters are dropped from the copy.
    """
    params = paramdict.copy()
    flag = 'bb_%s' % colname
    if '1' in params.get(flag, []):
        del params[flag]
    else:
        params[flag] = ['1']
    # I'm kind of dubious about this.  It means that you can turn off
    # filtering by a particular field by clicking on the header for that
    # field, and you can hide a field (if it would be hidden by default)
    # by doing that twice.  That seems kind of nonobvious and possibly
    # surprising.
    for stale in ('show_' + colname, 'f_' + colname):
        try:
            del params[stale]
        except KeyError:
            pass
    return urlencode(params)

def urlencode(paramdict):
    """Return this script's URL with paramdict encoded as a query string.

    paramdict maps each name to a sequence of values; every (name, value)
    pair becomes one 'name=value' item, both sides passed through str()
    and urllib.quote_plus.  Items are joined with '&' and appended to
    cgiurl() after a '?'.
    """
    # We can't rely on urllib.urlencode to be sane in Python 1.5 --- it
    # doesn't accept a list of tuples, so doesn't handle multiple values.
    pairs = []
    for rawkey, values in paramdict.items():
        quotedkey = urllib.quote_plus(str(rawkey))
        for rawvalue in values:
            pairs.append("%s=%s" % (quotedkey,
                                    urllib.quote_plus(str(rawvalue))))
    return "%s?%s" % (cgiurl(), string.join(pairs, '&'))

def breakoutby(colname, paramdict):
    "True if the request asks to break out by colname ('bb_<colname>' has '1')."
    flags = paramdict.get('bb_%s' % colname, [])
    return '1' in flags

def filterurl(fields, fieldnames, values, paramdict):
    """Return a URL that filters on the given column values.

    fields     -- column indexes into fieldnames.
    fieldnames -- all column names.
    values     -- one value per entry of fields, in the same order.
    paramdict  -- current request parameters; only a copy is modified.

    For each listed column the copy gets 'f_<name>' set to the value, and
    loses any 'bb_<name>' and 'show_<name>' parameters.
    """
    params = paramdict.copy()
    position = 0
    for field in fields:
        name = fieldnames[field]
        params['f_%s' % name] = [values[position]]
        for stale in ('bb_' + name, 'show_' + name):
            try:
                del params[stale]
            except KeyError:
                pass
        position = position + 1
    return urlencode(params)

def filterby(colname, paramdict):
    "Return the first 'f_<colname>' value in paramdict, or None if absent."
    try:
        values = paramdict['f_%s' % colname]
    except KeyError:
        return None
    return values[0]

def unhideurl(colname, paramdict):
    """Return a URL like the current request but with colname shown.

    Adds 'show_<colname>' to a copy of paramdict; paramdict itself is
    left untouched.
    """
    params = paramdict.copy()
    # urlencode() passes every value through str(), so this goes out as "1".
    params['show_' + colname] = [1]
    return urlencode(params)

def hidden(colname, paramdict, defaulthides):
    """True if colname should be left out of the table.

    A column is shown when any of these holds: it is not in defaulthides,
    the request carries 'show_<colname>' with '1', or the request filters
    or breaks out by it.
    """
    # hiding things you're filtering or breaking out by is confusing.
    return not (colname not in defaulthides or
                '1' in paramdict.get("show_%s" % colname, []) or
                filterby(colname, paramdict) or
                breakoutby(colname, paramdict))

def cgiurl():
    """Return a URL likely to refer to this script, without parameters.

    Built from the CGI environment: SERVER_PORT (80 if unset), HTTPS,
    HTTP_HOST (falling back to SERVER_NAME, then 'localhost') and
    SCRIPT_NAME (falling back to sys.argv[0]).  The port is left out of
    the URL when it is the default one for the protocol.
    """
    # logic copied from CGI.pm 2.46, minus some features and bugs
    getenv = os.environ.get
    port = int(getenv('SERVER_PORT', '80'))  # default is for debugging
    # Servers commonly set HTTPS to lower-case "on", so accept any case.
    if getenv('HTTPS', '') in ('ON', 'on', 'On', 'oN') or port == 443:
        protocol = "https"
    else:
        protocol = "http"
    if (protocol == 'http' and port == 80 or
        protocol == 'https' and port == 443):
        portstr = ""
    else:
        portstr = ":" + str(port)
    hostname = getenv('HTTP_HOST', getenv('SERVER_NAME', 'localhost'))
    # A Host header may already carry ":port"; strip it so the port is not
    # written twice.  Only a trailing colon followed by digits is removed.
    end = len(hostname)
    while end > 0 and hostname[end - 1] in '0123456789':
        end = end - 1
    if 0 < end < len(hostname) and hostname[end - 1] == ':':
        hostname = hostname[:end - 1]
    path = getenv('SCRIPT_NAME', sys.argv[0])
    return protocol + "://" + hostname + portstr + path

def fwiffle(filename, fieldtitles, output, paramdict, datatitle="data",
            defaulthide=[]):
    """Write a complete HTML summary page for a data file.

    filename    -- path of the data file; it is read via oerlap.filelines.
    fieldtitles -- column names, in the order fields appear in each line.
    output      -- callable taking one string; it receives the
                   Content-Type header and every piece of the page.
    paramdict   -- request parameters as a dict of name -> list of values.
                   'bb_<col>' breaks out by a column, 'f_<col>' filters
                   rows on it and 'show_<col>' reveals a hidden column.
    datatitle   -- name of the data set, used in the title and heading.
    defaulthide -- names of columns hidden unless asked for; not modified.

    The closing tags are written and the input file is closed even when
    an exception escapes; the exception then propagates to the caller.
    """
    mytitle = title(datatitle, fieldtitles, paramdict)
    output(('Content-Type: text/html\n\n<html><head><title>%s</title>\n' +
            '</head><body><h1>%s</h1>\n') % (mytitle, mytitle))
    # paramdict holds request data, so escape it before echoing it.
    if debug: output(cgi.escape('%s' % paramdict))
    infile = None
    try:
        bocols = []      # indexes of the columns to break out by
        filters = []     # input row selection criteria: (index, value)
        hiddencols = []  # indexes of the columns left out of the table
        for ii in range(len(fieldtitles)):
            colname = fieldtitles[ii]
            if breakoutby(colname, paramdict):
                bocols.append(ii)
            wanted = filterby(colname, paramdict)
            if wanted is not None:
                filters.append((ii, wanted))
            if hidden(colname, paramdict, defaulthide):
                hiddencols.append(ii)
        infile = open(filename)
        datasrc = oerlap.filterdata(oerlap.filelines(infile), filters)
        results, freqs, n = oerlap.oerlap(datasrc, bocols)

        output('<p>%s %s</p>' % (describe_filters(fieldtitles, paramdict),
                                describe_hides(fieldtitles, paramdict,
                                               defaulthide)))
        output('<p>%s selected from input.  %s in this summary.</p>\n' %
               (render_nrows(n), render_nrows(len(results))))
        output('<table border>\n')
        # One header link per visible column; clicking toggles break-out.
        headers = []
        for name in fieldtitles:
            if hidden(name, paramdict, defaulthide): continue
            headers.append('<a href="%s">%s</a>' %
                           (breakoutbyurl(name, paramdict), cgi.escape(name)))
        output(row(['N'] + headers, 'th'))

        maxn = 3  # values listed per cell before the "(n more)" line
        for eachkey in oerlap.sort(freqs):
            # With a single summary row there is nothing to zoom in on.
            if len(results) == 1:
                zoom = ''
            else:
                zoom = ('<a href="%s">(zoom)</a>' %
                        filterurl(bocols, fieldtitles, eachkey, paramdict))
            cells = ['<p align="center">%s<br />%s</p>'
                     % (str(freqs[eachkey]), zoom)]
            for ii in range(len(results[eachkey])):
                if ii in hiddencols: continue
                frequencies = results[eachkey][ii]
                mystr = ['<table width="100%">']
                for eachitem in oerlap.sort(frequencies)[:maxn]:
                    mystr.append(('<tr><td>%s</td>' +
                                 '<td align="right">%s</td></tr>\n') %
                                 (userval(eachitem), frequencies[eachitem]))
                if len(frequencies) > maxn:
                    mystr.append('<tr><td colspan="2" align="center">' +
                                 '(%d&nbsp;more)</td></tr>\n' %
                                 (len(frequencies) - maxn))
                cells.append(string.join(mystr + ['</table>'], ''))
            output(row(cells, 'td valign="top"'))
    finally:
        try:
            output('</table></body></html>\n')
        finally:
            if infile is not None: infile.close()

def oerlapcgi(filename, fieldtitles, datatitle="data", defaulthide=[]):
    """Provides a cgi veneer over fwiffle.

    When GATEWAY_INTERFACE is set in the environment and the cgitb module
    can be imported, its excepthook is installed as sys.excepthook.  The
    page is then written to sys.stdout using the parameters returned by
    cgi.parse().
    """
    import os
    running_as_cgi = os.environ.get("GATEWAY_INTERFACE") is not None
    if running_as_cgi:
        try:
            import cgitb
        except ImportError:
            # cgitb (http://web.lfw.org/python/) is not installed, so
            # errors are left to the default exception hook.
            pass
        else:
            sys.excepthook = cgitb.excepthook
    params = cgi.parse()
    fwiffle(filename, fieldtitles, sys.stdout.write, params,
            datatitle=datatitle, defaulthide=defaulthide)


That imports oerlap.py, which follows:

# incredibly powerful secret web log analysis tool
import string

def oerlap(datasrc, breakoutby):
    """Analyze data.

    Given a data source that yields tuples or None when .next() is called,
    and a sequence 'breakoutby' that specifies which fields of the tuples to
    break out by, count frequencies.  Reading stops at the first None.

    Returns a tuple (results, freqs, n):

    results -- dict; keys are tuples of the values things are broken out
               by; values are lists with one dict per column, each mapping
               a value seen in that column to how often it was seen.
    freqs   -- dict mapping the same keys to the number of rows per key.
    n       -- total number of rows read.

    Rows under one key may differ in length.  The column list grows to the
    longest row seen, and a shorter row counts None in the columns it
    lacks.
    """
    results = {}
    freqs = {}
    nn = 0
    while 1:
        line = datasrc.next()
        if line is None: return results, freqs, nn
        nn = nn + 1
        keyparts = []
        for field in breakoutby:
            keyparts.append(line[field])
        key = tuple(keyparts)
        columns = results.get(key)
        if columns is None:
            columns = []
            results[key] = columns
        freqs[key] = freqs.get(key, 0) + 1
        # Append a fresh dict per missing column.  Multiplying a one-dict
        # list would make every new column share a single dict.
        while len(columns) < len(line):
            columns.append({})
        for ii in range(len(columns)):
            if ii < len(line):
                value = line[ii]
            else:
                value = None
            counts = columns[ii]
            counts[value] = counts.get(value, 0) + 1

def sort(freqs):
    """Return the keys of a dict, ordered by descending value.

    Keys with equal values come out in descending key order, because the
    (value, key) pairs are sorted and then reversed.  Useful for the
    freqs result of oerlap or for the individual column dicts inside its
    results.
    """
    pairs = []
    for key, value in freqs.items():
        pairs.append((value, key))
    pairs.sort()
    pairs.reverse()
    ordered = []
    for pair in pairs:
        ordered.append(pair[1])
    return ordered

class filterdata:
    """Data source wrapper that passes on only the items matching a filter.

    The filter is a sequence of (field, value) pairs; an item matches when
    item[field] equals value for every pair.  An empty filter matches
    everything.
    """
    def __init__(self, datasource, filter):
        self.datasource = datasource
        self.filter = filter
    def next(self):
        "Return the next matching item, or None once the source is done."
        while 1:
            candidate = self.datasource.next()
            if candidate is None:
                return None
            matched = 1
            for field, value in self.filter:
                if candidate[field] != value:
                    matched = 0
                    break
            if matched:
                return candidate

class filelines:
    "Return lines from a file."
    def __init__(self, somefile):
        self.file = somefile
    def next(self):
        line = self.file.readline()
        if line == "": return None
        return tuple(map(lambda x: intern(x), string.split(line)))

class arrayitems:
    "For testing.  Data source that returns the items of an array in order."
    def __init__(self, somearray):
        self.array = somearray
        self.ii = 0
    def next(self):
        "Return the next item, or None once every item has been returned."
        if self.ii == len(self.array):
            return None
        position = self.ii
        # Advance before indexing so the cursor moves even if the lookup fails.
        self.ii = position + 1
        return self.array[position]

# Small fixture for trying oerlap() by hand: five three-field rows.
testdata = [
    ('a', 1, 32),
    ('a', 1, 33),
    ('b', 1, 31),
    ('c', 2, 30),
    ('a', 0, 30),
]

def test(bb=[]):
    "Run oerlap over testdata, breaking out by the field indexes in bb."
    source = arrayitems(testdata)
    return oerlap(source, bb)




-- 
<[EMAIL PROTECTED]>       Kragen Sitaker     <http://www.pobox.com/~kragen/>
The sages do not believe that making no mistakes is a blessing. They believe, 
rather, that the great virtue of man lies in his ability to correct his 
mistakes and continually make a new man of himself.  -- Wang Yang-Ming

oerlap: quick analysis of multivariate data

Reply via email to