#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Identify words that are the same on a QWERTY keyboard modulo difference in hands.

The objective here is to analyze the feasibility of one-handed typing
using T9-style disambiguation.  That is, if we consider the two hands
equivalent and just choose the most common 1-gram possibility for each
word, how often will we choose a possibility the user didn't intend?
And what are the most troublesome such possibilities?

Results:

> wess: will (253955) well (116604) oils (789)
> tww: two (156114) too (70169) tow (363) yow (43) tww (32) yoo (6)
> vade: make (79054) made (68515) vade (6) madi (6)
> wwrsd: would (255198) world (59031)
> gave: have (473693) gave (22946) game (14943) hame (8)
> set: set (38769) let (25918) sit (8703) lit (1466) ley (173) sey (21)
> sede: like (149479) side (33418) sidi (53) sede (12)
> saw: saw (26734) law (25346) sao (182) lao (33)
> sat: say (67862) sat (10950) lay (9690) lat (22)
> swve: some (171368) love (20146)
...
> 1538199 total errors in 90080933 words, 1.7%

This makes me think:

1. It's probably a feasible alternative for full-speed English text,
   if you have a keybinding to correct a missed guess once or twice a
   minute;
2. 2-gram disambiguation would provide much better results, but would
   not make missed guesses insignificant and eliminate the need for a
   correction keybinding — 'make' and 'made'; 'set', 'let', and 'sit';
   'say' and 'lay'; and other similar cases; would still have
   significant missed-guess rates with any conceivable N-gram model
   over words.

"""
from __future__ import division

import string

# Each row of the keyboard as (left-hand keys, right-hand keys).
keyboard = [('qwert', 'yuiop'),
            ('asdfg', 'hjkl;'),
            ('zxcvb', 'nm,./')]

assert all(len(left_n) == len(right_n) for left_n, right_n in keyboard)

left_side = ''.join(left for left, right in keyboard)
# The right-hand rows are reversed so that every right-hand key pairs with
# its mirror image on the left hand (p -> q, o -> w, ..., y -> t).
right_side = ''.join(''.join(reversed(right)) for left, right in keyboard)

# Table mapping every right-hand key to its mirrored left-hand key.
try:
    translation = str.maketrans(right_side, left_side)      # Python 3
except AttributeError:
    translation = string.maketrans(right_side, left_side)   # Python 2


def errors_for(fws):
    """Return the summed frequency of every entry of *fws* but the first.

    *fws* is a list of (freq, word) pairs sorted most frequent first.
    Always guessing the first word mis-guesses each of the others as often
    as it occurs, so this sum is the error count for one word class.
    """
    return sum(freq for freq, word in fws[1:])


def load_classes(lines):
    """Group the words found in *lines* by their one-handed word class.

    Each non-blank line must hold a frequency and a word separated by
    whitespace.  Blank lines are skipped.

    Returns a ``(classes, total_1grams)`` pair: *classes* maps a word
    class (the word with every right-hand key replaced by its mirrored
    left-hand key) to its (freq, word) pairs sorted in descending order,
    and *total_1grams* is the sum of all frequencies read.
    """
    classes = {}
    total_1grams = 0
    for line in lines:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline
        freq, word = fields
        freq = int(freq)
        total_1grams += freq
        classes.setdefault(word.translate(translation), []).append((freq, word))
    for collisions in classes.values():
        collisions.sort(reverse=True)
    return classes, total_1grams


def report_lines(classes, total_1grams):
    """Yield the report, one output line at a time.

    There is one line per word class containing more than one word, in
    order of decreasing error count, followed by a summary line with the
    total error count and the error rate as a percentage.
    """
    errors = 0
    by_errors = sorted(classes,
                       key=lambda word_class: errors_for(classes[word_class]),
                       reverse=True)
    for word_class in by_errors:
        collisions = classes[word_class]
        if len(collisions) == 1:
            continue  # unambiguous word: nothing to report
        errors += errors_for(collisions)
        yield "%s: %s" % (word_class,
                          ' '.join('%s (%s)' % (word, freq)
                                   for freq, word in collisions))
    # An empty word list would otherwise divide by zero.
    percentage = errors / total_1grams * 100 if total_1grams else 0.0
    yield "%d total errors in %d words, %.2g%%" % (errors, total_1grams,
                                                   percentage)


def main(path='wordlist'):
    """Read the word list at *path* and print the collision report."""
    # wget http://canonical.org/~kragen/sw/wordlist
    with open(path) as wordlist:
        classes, total_1grams = load_classes(wordlist)
    for line in report_lines(classes, total_1grams):
        print(line)


if __name__ == '__main__':
    main()

# Footer of the mailing-list post this script was taken from:
# -- To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks