a four-key input method

Kragen Sitaker Wed, 22 Jan 2003 00:40:02 -0800

Here's a method for inputting text with four keys: the digits 1, 2, 3,
and the backspace key.  It uses experimental probabilities of word
occurrence (stored in the file ".wordlist") to speed up input.  By
default, it generates some default SWAG probabilities from the lengths
of the words in /usr/share/dict/words; it worked a lot better for me
when I merged data into the .wordlist file from much of my recent
email, by the following means.


cat priv/mail/outmail[efgh]* | tr -c 'a-zA-Z' '^V^J' | sort | uniq -c | awk '{print 
$2, $1}' >x
cat .wordlist x | sort -f > .wrodlist.new
mv .wrodlist.new .wordlist

Someone somewhere claimed that three is, in some sense, the optimal
number of alternatives for things like this; the time to navigate to
your goal is jointly proportional to the number of points at which you
must make a choice and the number of alternatives you must examine at
each point; the number of points at which you must make a choice is
roughly the logarithm of the total number of choices (e.g. the number
of words in English) to the base of the number of choices at each
point.  Taking N as the total number of choices and n as the number at
each point, the number of times you need to make a choice is
ln(N)/ln(n), and the total time to pick one of the N items is 
(n ln(N)/ln(n)).  I can't differentiate that at 12:30 AM,
unfortunately, but I'm told it has a minimum at n=e, and is smaller at
n=3 than where n is any other whole number.  That's why I picked three
alternatives.

Even with the probabilities from my old email, I can't input text
nearly as fast with this method as I can with, say, Tegic T9.  I tried
a certain short phrase; keyboarding it with QWERTY, it took me about 3
seconds; with T9, I entered it in 25 seconds; with the traditional
keypad entry "press 2 once for A, twice for B, three times for C", it
took 37 seconds; with this 4-key method, it took me 148 seconds with
the probabilities guessed from /usr/share/dict/words and 116 seconds
with word probabilities from my email.  Conditional probabilities
based on previous words might help some.

#!/usr/bin/python
import os, sys, random, bisect, math

def chomp(line):
    if line.endswith('\n'): return chomp(line[:-1])
    return line

def make_initial_wordlist(filename):
    # what initial distribution should we assume?  Presumably English
    # is pretty well huffman-coded, so a word's frequency should
    # decrease exponentially with its length.  I don't know the base
    # of the exponent, but some random web page asserts that English
    # has about 2.62 bits per letter, so I guesstimate that a word's
    # frequency should be proportional to 2^(-2.62 * len), or
    # (2^-2.62)^len, or e^((ln 2) * -2.62 * len).
    infile = open('/usr/share/dict/words')
    filename_new = "%s.new" % filename
    outfile = open(filename_new, 'w')
    freqfudgefactor = math.log(2) * -2.62
    for line in infile.xreadlines():
        word = chomp(line)
        freqguess = math.exp(freqfudgefactor * len(word)) * 200
        outfile.write('%s %.10f\n' % (word, freqguess))
    outfile.close()
    os.rename(filename_new, filename)  # only works in Unix!

def read_wordlist():
    class wordlist:
        def __init__(self, filename):
            self.filename = filename
            infile = open(filename)
            lastword = None
            self.words = []
            self.probs = []
            self.cumprobs = [0]
            for line in infile.xreadlines():
                word, prob = line.split()
                prob = float(prob)
                if word != lastword:
                    self.words.append(word)
                    self.probs.append(prob)
                    self.cumprobs.append(self.cumprobs[-1] + prob)
                else:  # dup
                    self.probs[-1] += prob
                    self.cumprobs[-1] += prob
                lastword = word
            del self.cumprobs[0:1]
            infile.close()
        def start(self): return 0
        def end(self): return len(self.words)
        def get(self, ii):
            self.probs[ii] += 1
            return self.words[ii]
        def write(self):
            filename_new = '%s.new' % self.filename
            outfile = open(filename_new, 'w')
            for ii in xrange(self.end()):
                outfile.write('%s %.10f\n' % (self.words[ii], self.probs[ii]))
            outfile.close()
            os.rename(filename_new, self.filename)
        def divide(self, start, end):
            startprob = self.cumprobs[start]
            endprob = self.cumprobs[end]
            first = bisect.bisect_left(self.cumprobs,
                                       (startprob + startprob + endprob)/3.0,
                                       start, end)
            second = bisect.bisect_left(self.cumprobs,
                                        (startprob + endprob + endprob)/3.0,
                                        start, end)
            # this is a dumb kludge:
            if second == end and end - start > 1:
                second -= 1
            if first >= second:
                first = second
                if first == second and second - start > 1:
                    first -= 1
            return first, second
    filename = '.wordlist'
    if not os.path.exists(filename):
        print "initializing wordlist..."
        make_initial_wordlist(filename)
    return wordlist(filename)

class cursor:
    def __init__(self, wordlist):
        self.stack = [(wordlist.start(), wordlist.end()-1)]
        self.wordlist = wordlist
    def _opts(self):
        start, end = self.stack[-1]
        first, second = self.wordlist.divide(start, end)
        return [start, first, second, end]
    def word(self, ii):
        return self.wordlist.words[ii]
    def options(self):
        return '[%s 1 %s 2 %s 3 %s]' % tuple(map(self.word, self._opts()))
    def shrink(self, whichpart):
        opts = self._opts()
        self.stack.append((opts[whichpart-1], opts[whichpart]))
    def expand(self):
        if len(self.stack) > 1: self.stack.pop()
    def done(self):
        opts = self._opts()
        return opts[0] == opts[-1]
    def get(self):
        return self.wordlist.get(self.stack[-1][0])
    def dump(self):
        opts = self._opts()
        return '%s %s %s' % (opts, map(self.word, opts),
                             [self.wordlist.cumprobs[ii] for ii in opts])

def main():
    print "reading wordlist..."
    wordlist = read_wordlist()
    os.system('stty cbreak -echo')
    backspace, delete = '\b', '\177'
    try:
        while 1:
            c = cursor(wordlist)
            msg = ''
            while not c.done():
                msg += c.options()
                sys.stdout.write(msg)
                char = sys.stdin.read(1)
                sys.stdout.write((backspace + ' ' + backspace) * len(msg))
                msg = ''
                if char in '123':
                    c.shrink(int(char))
                elif char in [backspace, delete]:
                    c.expand()
                elif char == '?':
                    msg = c.dump() + ' '
                else:
                    msg = "unknown char '" + char + "' "
            sys.stdout.write(c.get() + ' ')
    finally:
        sys.stdout.write("exiting...")
        sys.stdout.flush()
        os.system('stty -cbreak echo')
        wordlist.write()

if __name__ == '__main__': main()


-- 
<[EMAIL PROTECTED]>       Kragen Sitaker     <http://www.pobox.com/~kragen/>
Edsger Wybe Dijkstra died in August of 2002.  The world has lost a great
man.  See http://advogato.org/person/raph/diary.html?start=252 and
http://www.kode-fu.com/geek/2002_08_04_archive.shtml for details.

a four-key input method

Reply via email to