I started on a project to correct all of the instances of "prandom_u32() % FOO" in the kernel (there are lots) to "prandom_u32_max(FOO)".
This has snowballed into an ongoing audit of random number use in the kernel, including uses of get_random_bytes() when get_random_u32 or prandom_u32() would suffice. But one of the more potentially-controversial changes in the series, which might benefit from more discussion time, is a change of the actual prandom_u32() generator itself. The current lfsr113 generator isn't bad, but like all LFSRs, it fails linear complexity tests in suites like testU01 and PractRand. And there are generators with the same 128-bit state which produce better output in about half the time. So I figured as long as I was neck-deep in the code, I might as well tweak that, too. Thw ones that seem interesting to me are: - Chris Doty-Humphrey's sfc32. This is a 96-bit chaotic generator (meaning period *probably* long but not well defined) fed with a 32-bit counter to ensure a minimum period. It's extremely fast, and the author is also the author of PractRand, so it's well-tested. - Vigna and Bacman's xoshiro128**. This is a 128-bit LFSR with some output postprocessing. - (on 64-bit machines) xoroshiro128**, by the same authors. This is only efficient on 64-bit machines, so it would need a have a 32-bit backup. - Bob Jenkins' jsf (originally "flea"). 128 bits, good mixing, fully chaotic. I prefer the safety of a guaranteed minimum period, but this is well thought of. - A lag-3 mutiply-with-carry generator. 2^32 - 1736 is the largest "safe prime" mutiplier. I'm currently planning on using the first. It also has the advantage for lockless reseeding that there are no bad states to avoid. Some discussion of most of these at http://www.pcg-random.org/posts/some-prng-implementations.html Here are some timing numbers. Clock cycles per 1e7 iterations, sp 8-19 cycles per iteration. Core 2: lcg32: cycles 89018580 lcg64: cycles 130782048 mwc: cycles 90385992 prandom_u32: cycles 168780348 xoshiro128ss: cycles 90331056 xoroshiro128ss: cycles 106012008 sfc32: cycles 90318276 jsf32: cycles 90364464 jsf32a: cycles 100382724 gjrand: cycles 131713680 Opteron: lcg32: cycles 103434699 lcg64: cycles 80064205 mwc: cycles 110313786 prandom_u32: cycles 190311062 xoshiro128ss: cycles 110115475 xoroshiro128ss: cycles 100114345 sfc32: cycles 100163397 jsf32: cycles 100104957 jsf32a: cycles 110133030 gjrand: cycles 110122007 (The LCGs are not 128-bit state; they're just there as a speed baseine.) As you can see, the current lfsr113-based prandom_u32 takes almost twice the time of the aternatives. My quick & dirty test code is appended for anyone who wants to play. #include <stdint.h> typedef uint32_t u32; typedef uint64_t u64; struct rnd_state { u32 s1, s2, s3, s4; }; u32 lcg32(struct rnd_state *state) { u32 x = state->s1 * 2891336453 + 1; return state->s1 = x; } u32 lcg64(struct rnd_state *state) { u64 x = *(u64 *)&state->s1; //x = x * 0xf2fc5985 + 1; x = x * 6364136223846793005 + 1; *(u64 *)&state->s1 = x; return x >> 32; } u32 mwc(struct rnd_state *state) { u64 x = state->s1 + (u64)state->s2 * -(u32)1736; state->s2 = state->s3; state->s3 = state->s4; state->s1 = x >> 32; return state->s4 = (u32)x; } u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) u32 x = state->s1 = TAUSWORTHE(state->s1, 6, 13, ~1u, 18); x ^= state->s2 = TAUSWORTHE(state->s2, 2, 27, ~7u, 2); //barrier(); x ^= state->s3 = TAUSWORTHE(state->s3, 13, 21, ~15u, 7); x ^= state->s4 = TAUSWORTHE(state->s4, 3, 12, ~127u, 13); return x; } static inline uint32_t rol32(uint32_t x, int k) { return x << k | x >> (32 - k); } uint32_t xoshiro128ss(uint32_t s[4]) { #if 0 const uint32_t res = rol32(s[0] * 5, 7) * 9; const uint32_t t = s[1] << 9; s[2] ^= s[0]; s[3] ^= s[1]; s[1] ^= s[2]; s[0] ^= s[3]; s[2] ^= t; s[3] = rol32(s[3], 11); return res; #else /* xoshiro128** */ uint32_t a = s[0], b = s[1], c = s[2] ^ a, d = s[3] ^ b; s[0] = a ^ d; s[1] = b ^ c; s[2] = c ^ (b << 9); s[3] = rol32(d, 11); return rol32(a * 5, 7) * 9; /* The "**" non-linear function */ #endif } static inline uint64_t rol64(uint64_t x, int k) { return x << k | x >> (64 - k); } uint64_t xoroshiro128ss(uint64_t s[2]) { const uint64_t s0 = s[0]; uint64_t s1 = s[1] ^ s0; const uint64_t res = rol64(s0 * 5, 7) * 9; s[1] = rol64(s1, 37); // c s[0] = rol64(s0, 24) ^ s1 ^ (s1 << 16); // a, b return res; } uint32_t sfc32(uint32_t s[4]) { uint32_t b = s[1], c = s[2]; uint32_t const res = s[0] + b + s[3]++; // good sets include {21,9,3},{15,8,3}; // older versions used {25,8,3} which wasn't as good s[0] = b ^ (b >> 9); s[1] = c + (c << 3); s[2] = rol32(c, 21) + res; return res; } /* http://burtleburtle.net/bob/rand/smallprng.html */ uint32_t jsf32(uint32_t s[4]) { uint32_t e = s[0] - rol32(s[1], 27); s[0] = s[1] ^ rol32(s[2], 17); s[1] = s[2] + s[3]; s[2] = s[3] + e; return s[3] = e + s[0]; } uint32_t jsf32a(uint32_t s[4]) { uint32_t e = s[0] - rol32(s[1], 23); s[0] = s[1] ^ rol32(s[2], 16); s[1] = s[2] + rol32(s[3], 11); s[2] = s[3] + e; return s[3] = e + s[0]; } /* See https://gist.github.com/imneme/7a783e20f71259cc13e219829bcea4ac */ uint32_t gjrand(uint32_t s[4]) { uint32_t a = s[0], b = s[1], c = s[2], d = s[3] += 0x96a5; b += c; a = rol32(a, 16); c ^= b; a += b; c = rol32(c, 11); b ^= a; a += c; b = rol32(b, 19); c += a; b += d; s[0] = a; s[1] = b; s[2] = c; return a; } #include <stdio.h> static uint64_t rdtsc() { uint32_t low, high; asm volatile("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; } typedef uint32_t (*rng_func)(uint64_t s[2]); #define FUNCS 10 int main(void) { static const rng_func funcs[FUNCS] = { (rng_func)lcg32, (rng_func)lcg64, (rng_func)mwc, (rng_func)prandom_u32_state, (rng_func)xoshiro128ss, (rng_func)xoroshiro128ss, (rng_func)sfc32, (rng_func)jsf32, (rng_func)jsf32a, (rng_func)gjrand }; static const char rng_names[FUNCS][16] = { " lcg32", " lcg64", " mwc", " prandom_u32", " xoshiro128ss", "xoroshiro128ss", " sfc32", " jsf32", " jsf32a", " gjrand" }; uint32_t sums[FUNCS]; int i, j, k; uint64_t timing[FUNCS], min_timing[FUNCS]; for (k = 0; k < 10; k++) { for (i = 0; i < FUNCS; i++) { uint32_t sum = 0; uint64_t s[2] = { -1ul, -1ul }; uint64_t start = rdtsc(); asm("" : "+g" (start)); for (j = 0; j < 10000000; j++) sum += funcs[i](s); asm("" : "+g" (start)); timing[i] = rdtsc() - start; sums[i] = sum; } for (i = 0; i < FUNCS; i++) { printf("%s: sum %08x cycles %lu\n", rng_names[i], sums[i], timing[i]); if (!k || timing[i] < min_timing[i]) min_timing[i] = timing[i]; } } puts("Minimum timings:"); for (i = 0; i < FUNCS; i++) printf("%s: cycles %lu\n", rng_names[i], min_timing[i]); return 0; }
