Chris Angelico wrote: > On Wed, Sep 5, 2012 at 6:29 PM, Peter Otten <__pete...@web.de> wrote: >> comparing every pair in a sample of 1000 8-char words >> taken from '/usr/share/dict/words' >> >> head >> 1: 477222 ************************************************************ >> 2: 18870 ** >> ...
tail 1: 386633 ************************************************************ 2: 74966 *********** > Not understanding this. What are the statistics, I measured how many chars they have in common for all combinations of 1000 words taken from /usr/share/dict/words. 477222 pairs have one char in common, 18870 pairs have two chars in common when compared from the *beginning* of the string. 386633 pairs have one char in common, 74966 pairs have two chars in common when compared from the *end* of the string. and what (if it's not obvious from the previous answer) do they prove? They demonstrate that for two words from that particular corpus it is likely that a[::-1] == b[::-1] has to take more (1.34 versus 1.05 on average) characters into account than a == b, i. e. comparing from the back should be slower rather than faster. If that doesn't help, here's the code ;) import random import itertools from contextlib import contextmanager from collections import defaultdict @contextmanager def open_corpus(args): with open(args.name) as instream: words = (line.strip() for line in instream) #words = (word for word in words if not word.endswith("'s")) yield words def pick(items, count): items = list(items) return random.sample(items, count) def count_common(a, b): i = 0 for i, (x, y) in enumerate(zip(a, b), 1): if x != y: break return i def corpus(args): with open_corpus(args) as words: wordlens = (len(line.strip()) for line in words) freq = defaultdict(int) for wordlen in wordlens: freq[wordlen] += 1 show_histogram(freq) def show_histogram(freq): peak = max(freq.values()) freq_width = len(str(peak)) max_freq = max(freq) len_width = len(str(max_freq)) for i in range(min(freq), max_freq+1): value = freq[i] print("{:{}}: {:{}} {}".format( i, len_width, value, freq_width, "*" * (60 * value // peak))) def compare(args): with open_corpus(args) as words: words = pick( (word for word in words if len(word) == args.word_len), args.sample_size) n = 0 head_common = defaultdict(int) tail_common = defaultdict(int) n_tail = n_head = 0 for n, (a, b) in enumerate(itertools.combinations(words, 2), 1): cc = count_common(a, b) n_head += cc head_common[cc] += 1 ccr = count_common(a[::-1], b[::-1]) n_tail += ccr tail_common[ccr] += 1 print(n, "combinations") print("average common chars (head) {:.3}".format(n_head/n)) print("average common chars (tail) {:.3}".format(n_tail/n)) print("comparing every pair in a " "sample of {sample} {len}-char words\n" "taken from {name!r}".format( sample=args.sample_size, len=args.word_len, name=args.name)) print() print("head") show_histogram(head_common) print() print("tail") show_histogram(tail_common) def main(): import argparse parser = argparse.ArgumentParser() parsers = parser.add_subparsers() pcorpus = parsers.add_parser("corpus") pcorpus.add_argument("name") pcorpus.set_defaults(func=corpus) pcompare = parsers.add_parser("compare") pcompare.add_argument("name") pcompare.add_argument("--sample-size", type=int, default=1000) pcompare.add_argument("--word-len", type=int, default=8) pcompare.set_defaults(func=compare) args = parser.parse_args() args.func(args) if __name__ == "__main__": main() -- http://mail.python.org/mailman/listinfo/python-list