On 2010-02-24 20:19, Robert Kern wrote:
On 2010-02-24 13:09 PM, mk wrote:
On 2010-02-24 20:01, Robert Kern wrote:
I will repeat my advice to just use random.SystemRandom.choice() instead
of trying to interpret the bytes from /dev/urandom directly.

Oh I hear you -- for production use I would (will) certainly consider
this. However, now I'm interested in the problem itself: why is the damn
distribution not uniform?

You want "< 234", not "< 235". (234 % 26 == 0), so you get some extra 'a's.

Right, this explains the 'a' outlier. Fixed. But still:

import operator
import os
import random
import math

def rand_str_custom(n):
    """Build a lowercase string from ``n`` bytes read from ``os.urandom``.

    Bytes whose value is 234 or more are discarded, so the surviving range
    0..233 contains exactly nine values for every residue modulo 26
    (234 == 9 * 26).  Each surviving byte is then mapped onto 'a'..'z'.

    Rejected bytes are dropped rather than redrawn, so the returned string
    may be shorter than ``n`` characters.
    """
    # bytearray yields integer byte values on both Python 2 and Python 3,
    # which removes the need for a per-item ord() call.
    raw = bytearray(os.urandom(n))
    return ''.join([chr(ord('a') + b % 26) for b in raw if b < 234])

def count_chars(chardict, word):
    """Add the character frequencies of ``word`` to ``chardict`` in place.

    ``chardict`` maps a character to the number of times it has been seen;
    characters not yet present are inserted.  Nothing is returned.
    """
    for c in word:
        # A character seen for the first time counts as 1 (not 0), otherwise
        # every tally ends up one short.
        chardict[c] = chardict.get(c, 0) + 1

def rand_str_SystemRandom_seeding(length):
    """Return ``length`` lowercase letters chosen by ``random.SystemRandom``.

    Before any letter is drawn, ``random.seed`` is called with 32 bytes
    taken from ``os.urandom``.
    """
    # NOTE(review): random.seed() acts on the module-level generator; the
    # random docs say SystemRandom reads from the OS source and ignores
    # seeding, so this call presumably does not influence the letters below.
    random.seed(os.urandom(32))
    picker = random.SystemRandom()
    letters = [
        picker.choice('abcdefghijklmnopqrstuvwxyz')
        for _ in range(length)
    ]
    return ''.join(letters)

def rand_str_SystemRandom_noseeding(length):
    """Return ``length`` lowercase letters chosen by ``random.SystemRandom``.

    A fresh ``SystemRandom`` instance is created on every call and no
    seeding of any kind is performed.
    """
    picker = random.SystemRandom()
    letters = [
        picker.choice('abcdefghijklmnopqrstuvwxyz')
        for _ in range(length)
    ]
    return ''.join(letters)

def sd(x):
    """Fold ``x`` into the running totals and return the std deviation so far.

    State is kept in the function attributes ``sd.sum``, ``sd.sum2`` and
    ``sd.n``; the caller must set all three to 0 before starting a new
    series.  The value returned is the population standard deviation,
    sqrt(E[x^2] - E[x]^2), of every value passed in since that reset.
    """
    sd.sum  += x
    sd.sum2 += x*x
    sd.n    += 1.0   # float increment keeps the divisions below in floating point
    total, total_sq, count = sd.sum, sd.sum2, sd.n
    variance = total_sq/count - total*total/count/count
    # Floating-point cancellation can push a near-zero variance slightly
    # below zero, which would make math.sqrt raise ValueError; clamp it.
    return math.sqrt(max(variance, 0.0))

def gen_rand_with_fun(fun):
    """Print a character-frequency report for the string generator ``fun``.

    ``fun(10)`` is called 10000 times and the characters of every result are
    tallied with ``count_chars``.  The report lists the generator's name, the
    mean and standard deviation of the per-character counts, and then one
    line per character (most frequent first) showing how many standard
    deviations its count lies from the mean.
    """
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print(fun.__name__)
    chardict = {}
    for i in range(10000):
        count_chars(chardict, fun(10))
    counts = list(chardict.items())
    if not counts:
        # No characters were produced at all; there is nothing to average.
        return
    counts.sort(key=operator.itemgetter(1), reverse=True)
    nums = [c[1] for c in counts]
    # sd() accumulates in function attributes, so reset them for this run.
    sd.sum = sd.sum2 = sd.n = 0
    mean = (1.0*sum(nums))/len(nums)
    # Feed every count through the running calculation; the value returned
    # for the last one covers the whole series.
    stddev = 0.0
    for num in nums:
        stddev = sd(num)
    print('mean %s std dev %s' % (mean, stddev))
    for char, count in counts:
        # With zero spread every count equals the mean; avoid dividing by 0.
        deviation = (count - mean)/stddev if stddev else 0.0
        print('%s %s %.2f std devs away from mean' % (char, count, deviation))

if __name__ == "__main__":
    # Produce one frequency report per generator, with a blank line between
    # consecutive reports.
    generators = (
        rand_str_SystemRandom_seeding,
        rand_str_SystemRandom_noseeding,
        rand_str_custom,
    )
    for position, generator in enumerate(generators):
        if position:
            print('')
        gen_rand_with_fun(generator)




rand_str_SystemRandom_seeding
mean 3845.15384615 std dev 46.2016419186
l 3926 1.75 std devs away from mean
y 3916 1.53 std devs away from mean
d 3909 1.38 std devs away from mean
a 3898 1.14 std devs away from mean
p 3898 1.14 std devs away from mean
c 3889 0.95 std devs away from mean
u 3884 0.84 std devs away from mean
j 3873 0.60 std devs away from mean
n 3873 0.60 std devs away from mean
w 3866 0.45 std devs away from mean
x 3863 0.39 std devs away from mean
r 3855 0.21 std devs away from mean
m 3852 0.15 std devs away from mean
b 3841 -0.09 std devs away from mean
t 3835 -0.22 std devs away from mean
o 3829 -0.35 std devs away from mean
k 3827 -0.39 std devs away from mean
i 3821 -0.52 std devs away from mean
s 3812 -0.72 std devs away from mean
q 3806 -0.85 std devs away from mean
v 3803 -0.91 std devs away from mean
g 3799 -1.00 std devs away from mean
h 3793 -1.13 std devs away from mean
e 3782 -1.37 std devs away from mean
f 3766 -1.71 std devs away from mean
z 3758 -1.89 std devs away from mean

rand_str_SystemRandom_noseeding
mean 3845.15384615 std dev 55.670522726
i 3961 2.08 std devs away from mean
r 3911 1.18 std devs away from mean
e 3910 1.16 std devs away from mean
m 3905 1.08 std devs away from mean
a 3901 1.00 std devs away from mean
u 3893 0.86 std devs away from mean
t 3882 0.66 std devs away from mean
w 3872 0.48 std devs away from mean
s 3870 0.45 std devs away from mean
c 3868 0.41 std devs away from mean
n 3866 0.37 std devs away from mean
q 3865 0.36 std devs away from mean
k 3863 0.32 std devs away from mean
y 3848 0.05 std devs away from mean
j 3836 -0.16 std devs away from mean
v 3830 -0.27 std devs away from mean
f 3829 -0.29 std devs away from mean
z 3829 -0.29 std devs away from mean
g 3827 -0.33 std devs away from mean
l 3818 -0.49 std devs away from mean
b 3803 -0.76 std devs away from mean
d 3803 -0.76 std devs away from mean
p 3756 -1.60 std devs away from mean
x 3755 -1.62 std devs away from mean
h 3744 -1.82 std devs away from mean
o 3729 -2.09 std devs away from mean

rand_str_custom
mean 3517.15384615 std dev 40.7541336343
i 3586 1.69 std devs away from mean
a 3578 1.49 std devs away from mean
e 3575 1.42 std devs away from mean
m 3570 1.30 std devs away from mean
q 3562 1.10 std devs away from mean
c 3555 0.93 std devs away from mean
g 3552 0.86 std devs away from mean
w 3542 0.61 std devs away from mean
p 3536 0.46 std devs away from mean
x 3533 0.39 std devs away from mean
s 3528 0.27 std devs away from mean
o 3524 0.17 std devs away from mean
d 3516 -0.03 std devs away from mean
t 3515 -0.05 std devs away from mean
h 3511 -0.15 std devs away from mean
v 3502 -0.37 std devs away from mean
z 3502 -0.37 std devs away from mean
b 3500 -0.42 std devs away from mean
f 3496 -0.52 std devs away from mean
u 3492 -0.62 std devs away from mean
l 3486 -0.76 std devs away from mean
r 3478 -0.96 std devs away from mean
n 3476 -1.01 std devs away from mean
j 3451 -1.62 std devs away from mean
k 3450 -1.65 std devs away from mean
y 3430 -2.14 std devs away from mean


It would appear that SystemRandom().choice is indeed the best (judged by how far the counts stray from the mean, measured in standard deviations), but only after seeding it with os.urandom.


Regards,
mk


--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to