On Wed, Apr 09, 2014 at 05:16:55PM +0100, Mindaugas Rasiukevicius wrote: > > Perhaps I missed it, but were the benchmark results published? Also, how > about cprng_fast32/64()? If the current mechanism is indeed faster, then > I am glad that you have made an improvement.
To quote http://mail-index.netbsd.org/tech-crypto/2011/12/09/msg000538.html: This generator is approximately 20 times as fast as the old generator (dd with bs=64K yields 53MB/sec on 2Ghz Core2 instead of 2.5MB/sec) and also uses a separate mutex per instance so concurrency is greatly improved. I made a number of other performance improvements after that, but unless you insist let's leave off the subject of cprng_strong() for a bit. Attached is a patch which makes cprng_fast per-CPU and lockless. *IT IS NOT WELL TESTED YET (I haven't even run test vectors) AND IS ONLY FOR REVIEW.* The diff is against the head of the tls-earlyentropy branch. I would *very* much appreciate comments of any kind on this patch. Some notes: 1) This patch replaces RC4 with the HC-128 stream cipher. HC-128 is one of the selected eSTREAM ciphers. It is mature, well analyzed, and produces an output stream of 32-bit values which makes it well suited to our application. Unlike many of the other eSTREAM final portfolio ciphers it appears to perform well in a fairly naive, portable C implementation. Keying is slow but I've dealt with that. RC4 is at this point dangerously broken and HC-128 actually appears to be faster, too. 2) This patch wraps the actual manipulation of the HC-128 state with splhigh(). The alternative would be to prohibit use of cprng_fast* from interrupt context. Neither is great but either is better than a global spin mutex! Comments on this in particular I would appreciate. For reference, the operations in question cost about 4-6 cycles per output byte, and we don't usually ask for many output bytes, so we're not at IPL_HIGH for long. 3) This patch replaces direct keying of cprng_fast from the entropy pool with indirect keying via the kernel_cprng. This is done from a soft interrupt which is scheduled when (long before, actually) rekeying is needed. To make this work I had to adjust the initialization of the NFS filesystem module which was calling cprng_fast() *very* early. 
4) There are some moderately ugly macro tricks here to avoid copying output into alignment unless truly necessary and to avoid conditional execution at runtime. I think they are worth it. The copy likely costs almost as much as the actual stream generation and we all know what branch mispredictions cost. The alignment math is expressed with / and % because I find it more clear but I've looked at the asm output and it appears the compiler does the obvious optimizations. 5) The HC-128 implementation is Lucas Vella's from libestream, slightly reorganized and KNFed. Patch is attached. Thanks very much in advance for any comments. Thor
? kern/.init_main.c.swp ? sys/.cprng.h.swo Index: conf/files =================================================================== RCS file: /cvsroot/src/sys/conf/files,v retrieving revision 1.1090 diff -u -r1.1090 files --- conf/files 1 Apr 2014 17:49:30 -0000 1.1090 +++ conf/files 17 Apr 2014 01:20:26 -0000 @@ -160,6 +160,7 @@ include "crypto/rijndael/files.rijndael" include "crypto/skipjack/files.skipjack" include "crypto/camellia/files.camellia" +include "crypto/hc128/files.hc128" # General-purpose crypto processing framework. include "opencrypto/files.opencrypto" Index: kern/init_main.c =================================================================== RCS file: /cvsroot/src/sys/kern/init_main.c,v retrieving revision 1.454.2.1 diff -u -r1.454.2.1 init_main.c --- kern/init_main.c 7 Apr 2014 02:20:00 -0000 1.454.2.1 +++ kern/init_main.c 17 Apr 2014 01:20:27 -0000 @@ -497,6 +497,8 @@ /* Initialize the kernel strong PRNG. */ kern_cprng = cprng_strong_create("kernel", IPL_VM, CPRNG_INIT_ANY|CPRNG_REKEY_ANY); + + cprf_init(); /* Initialize interfaces. */ ifinit1(); Index: kern/subr_cprng.c =================================================================== RCS file: /cvsroot/src/sys/kern/subr_cprng.c,v retrieving revision 1.23 diff -u -r1.23 subr_cprng.c --- kern/subr_cprng.c 17 Jan 2014 02:12:48 -0000 1.23 +++ kern/subr_cprng.c 17 Apr 2014 01:20:27 -0000 @@ -43,6 +43,7 @@ #include <sys/kmem.h> #include <sys/lwp.h> #include <sys/once.h> +#include <sys/percpu.h> #include <sys/poll.h> /* XXX POLLIN/POLLOUT/&c. 
*/ #include <sys/select.h> #include <sys/systm.h> @@ -54,6 +55,7 @@ #endif #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h> +#include <crypto/hc128/hc128.h> #if defined(__HAVE_CPU_COUNTER) #include <machine/cpu_counter.h> @@ -72,6 +74,13 @@ static rndsink_callback_t cprng_strong_rndsink_callback; +percpu_t *percpu_cprf_ctx; +static int cprf_initialized; + +static void cprf_randrekey(cprf_ctx_t *); + +void *cprf_rekey_softintr = NULL; + void cprng_init(void) { @@ -103,10 +112,11 @@ return cpu_counter32(); #endif if (__predict_false(cold)) { + static int ctr; /* microtime unsafe if clock not running yet */ - return 0; + return ctr++; } - microtime(&tv); + getmicrotime(&tv); return (tv.tv_sec * 1000000 + tv.tv_usec); } @@ -532,8 +542,16 @@ } /* - * sysctl helper routine for kern.arandom node. Picks a random number - * for you. + * sysctl helper routine for kern.arandom node. Fills the supplied + * structure with random data for you. + * + * This node was originally declared as type "int" but its implementation + * in OpenBSD, whence it came, would happily return up to 8K of data if + * requested. Evidently this was used to key RC4 in userspace. + * + * In NetBSD, the libc stack-smash-protection code reads 64 bytes + * from here at every program startup. So though it would be nice + * to make this node return only 32 or 64 bits, we can't. Too bad! 
*/ static int sysctl_kern_arnd(SYSCTLFN_ARGS) @@ -542,31 +560,143 @@ void *v; struct sysctlnode node = *rnode; - if (*oldlenp == 0) + switch (*oldlenp) { + case 0: return 0; + default: + if (*oldlenp > 256) { + return E2BIG; + } + v = kmem_alloc(*oldlenp, KM_SLEEP); + cprng_fast(v, *oldlenp); + node.sysctl_data = v; + node.sysctl_size = *oldlenp; + error = sysctl_lookup(SYSCTLFN_CALL(&node)); + kmem_free(v, *oldlenp); + return error; + } +} + +static void +cprf_randrekey(cprf_ctx_t *ctx) +{ + uint8_t key[16], iv[16]; + hc128_state_t tempstate; + int s; + + int have_initial = rnd_initial_entropy; + + cprng_strong(kern_cprng, key, sizeof(key), FASYNC); + cprng_strong(kern_cprng, iv, sizeof(iv), FASYNC); + + /* Rekey into a temporary - expensive, don't do this at splhigh. */ + hc128_init(&tempstate, key, iv); + explicit_memset(key, 0, sizeof(key)); + explicit_memset(iv, 0, sizeof(iv)); + + s = splhigh(); + memcpy(&ctx->hc128, &tempstate, sizeof(tempstate)); + splx(s); + + explicit_memset(&tempstate, 0, sizeof(tempstate)); + /* - * This code used to allow sucking 8192 bytes at a time out - * of the kernel arc4random generator. Evidently there is some - * very old OpenBSD application code that may try to do this. - * - * Note that this node is documented as type "INT" -- 4 or 8 - * bytes, not 8192. - * - * We continue to support this abuse of the "len" pointer here - * but only 256 bytes at a time, as, anecdotally, the actual - * application use here was to generate RC4 keys in userspace. - * - * Support for such large requests will probably be removed - * entirely in the future. + * Reset for next reseed cycle. */ - if (*oldlenp > 256) - return E2BIG; + ctx->nextreseed = time_uptime + + (have_initial ? 
CPRF_RESEED_SECONDS : 0); + ctx->numbytes = 0; +} + +static void +cprf_init_ctx(void *v, + void *arg __unused, + struct cpu_info * ci __unused) +{ + cprf_ctx_t *ctx = v; + cprf_randrekey(ctx); +} + +static void +cprf_rekey_one(void *arg __unused) +{ + cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx); + cprf_randrekey(ctx); + percpu_putref(percpu_cprf_ctx); +} + +void +cprf_init(void) +{ + percpu_cprf_ctx = percpu_alloc(sizeof(cprf_ctx_t)); + percpu_foreach(percpu_cprf_ctx, cprf_init_ctx, NULL); + cprf_initialized++; + cprf_rekey_softintr = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE, + cprf_rekey_one, NULL); +} + +size_t +_cprng_fast_exact(void *p, size_t len) +{ + uint32_t *pi = p, *iter; + int s; + size_t ilen = len / sizeof(*pi); + cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx); + + KDASSERT(cprf_initialized); + KDASSERT(0 == ((ptrdiff_t)p % sizeof(uint32_t))); + KDASSERT(ilen * sizeof(*pi) == len); + + _cprf_checkrekey(ctx); + + s = splhigh(); + for (iter = pi; iter < pi + ilen; iter++) { + hc128_extract(&ctx->hc128, (uint8_t *)iter); + } + splx(s); + + ctx->numbytes += len; + percpu_putref(percpu_cprf_ctx); + return len; +} + +size_t +_cprng_fast_inexact(void *p, size_t len) +{ + uint8_t *pc = p; + uint32_t *pi = p, tmp, *iter; + int s; + size_t initial_len, aligned_len, final_len, main_len; + cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx); + + KDASSERT(cprf_initialized); + + initial_len = (sizeof(uint32_t) - + ((ptrdiff_t)pc % sizeof(uint32_t))) % sizeof(uint32_t); + if (initial_len > len) { + initial_len = len; + } + aligned_len = len - initial_len; + final_len = aligned_len % sizeof(uint32_t); + main_len = aligned_len - final_len; + + main_len /= sizeof(uint32_t); + + _cprf_checkrekey(ctx); + + s = splhigh(); + if (initial_len) { + hc128_extract(&ctx->hc128, (uint8_t *)&tmp); + memcpy(pc, &tmp, initial_len); + pi = (uint32_t *)(pc + initial_len); + } + + for (iter = pi; iter < pi + main_len ; iter++) { + hc128_extract(&ctx->hc128, (uint8_t *)iter); + } + + if (final_len) { + hc128_extract(&ctx->hc128, (uint8_t *)&tmp); + memcpy(pi + main_len, &tmp, final_len); + } + 
splx(s); - v = kmem_alloc(*oldlenp, KM_SLEEP); - cprng_fast(v, *oldlenp); - node.sysctl_data = v; - node.sysctl_size = *oldlenp; - error = sysctl_lookup(SYSCTLFN_CALL(&node)); - kmem_free(v, *oldlenp); - return error; + ctx->numbytes += len; + percpu_putref(percpu_cprf_ctx); + return len; } Index: lib/libkern/Makefile.libkern =================================================================== RCS file: /cvsroot/src/sys/lib/libkern/Makefile.libkern,v retrieving revision 1.32.2.1 diff -u -r1.32.2.1 Makefile.libkern --- lib/libkern/Makefile.libkern 7 Apr 2014 01:10:55 -0000 1.32.2.1 +++ lib/libkern/Makefile.libkern 17 Apr 2014 01:20:27 -0000 @@ -54,7 +54,7 @@ SRCS+= bswap64.c .endif SRCS+= md4c.c md5c.c rmd160.c sha1.c sha2.c murmurhash.c -SRCS+= pmatch.c arc4random.c bcd.c mcount.c mertwist.c crc32.c +SRCS+= pmatch.c bcd.c mcount.c mertwist.c crc32.c SRCS+= ppath_kmem_alloc.c Index: nfs/nfs_subs.c =================================================================== RCS file: /cvsroot/src/sys/nfs/nfs_subs.c,v retrieving revision 1.225 diff -u -r1.225 nfs_subs.c --- nfs/nfs_subs.c 17 Mar 2014 09:35:24 -0000 1.225 +++ nfs/nfs_subs.c 17 Apr 2014 01:20:27 -0000 @@ -1489,7 +1489,6 @@ nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; - nfs_xid = cprng_fast32(); nfsdreq_init(); /* @@ -1994,6 +1993,10 @@ { u_int32_t newxid; + if (__predict_false(nfs_xid == 0)) { + nfs_xid = cprng_fast32(); + } + /* get next xid. 
skip 0 */ do { newxid = atomic_inc_32_nv(&nfs_xid); Index: sys/cprng.h =================================================================== RCS file: /cvsroot/src/sys/sys/cprng.h,v retrieving revision 1.9 diff -u -r1.9 cprng.h --- sys/cprng.h 17 Jan 2014 02:08:56 -0000 1.9 +++ sys/cprng.h 17 Apr 2014 01:20:27 -0000 @@ -41,42 +41,91 @@ #include <sys/rnd.h> /* XXX users bogusly transitively need this */ #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h> +#include <crypto/hc128/hc128.h> +#include <sys/percpu.h> +#include <sys/intr.h> /* * NIST SP800-90 says 2^19 bytes per request for the CTR_DRBG. */ #define CPRNG_MAX_LEN 524288 +#define CPRF_MAXBYTES (512 * 1024 * 1024) +#define CPRF_HARDMAX (1 * 1024 * 1024 * 1024) +#define CPRF_RESEED_SECONDS 600 + +typedef struct { + hc128_state_t hc128; + int numbytes; + time_t nextreseed; +} cprf_ctx_t; + /* - * We do not want an arc4random() prototype available to anyone. + * This is a macro so we can skip any conditional logic at runtime if + * the size provided is a multiple of the underlying stream cipher + * blocksize, e.g. sizeof(padded struct). */ -void _arc4randbytes(void *, size_t); -uint32_t _arc4random(void); +#define cprng_fast(p, len) ((0 == ((len) % sizeof(uint32_t))) && \ + (0 == ((ptrdiff_t)(p) % sizeof(uint32_t))) ? 
\ + _cprng_fast_exact(p, len) : \ + _cprng_fast_inexact(p, len)) + +size_t _cprng_fast_exact(void *, size_t); +size_t _cprng_fast_inexact(void *, size_t); -static inline size_t -cprng_fast(void *p, size_t len) +static inline void +_cprf_checkrekey(cprf_ctx_t *ctx) { - _arc4randbytes(p, len); - return len; + extern void *cprf_rekey_softintr; + + if (__predict_false((ctx->numbytes > CPRF_MAXBYTES) || + (time_uptime > ctx->nextreseed))) { + /* Schedule a deferred reseed */ + softint_schedule(cprf_rekey_softintr); + } } -static inline uint32_t -cprng_fast32(void) +static inline uint32_t cprng_fast32(void) { - return _arc4random(); + uint32_t ret; + extern percpu_t *percpu_cprf_ctx; + cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx); + int s; + + _cprf_checkrekey(ctx); + + s = splhigh(); + hc128_extract(&ctx->hc128, (uint8_t *)&ret); + splx(s); + + ctx->numbytes += sizeof(uint32_t); + percpu_putref(percpu_cprf_ctx); + return ret; } -static inline uint64_t -cprng_fast64(void) +static inline uint64_t cprng_fast64(void) { - uint64_t r; - _arc4randbytes(&r, sizeof(r)); - return r; + uint64_t ret; + extern percpu_t *percpu_cprf_ctx; + cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx); + int s; + + _cprf_checkrekey(ctx); + + s = splhigh(); + hc128_extract(&ctx->hc128, (uint8_t *)&ret); + hc128_extract(&ctx->hc128, (uint8_t *)(((uint32_t *)&ret) + 1)); + splx(s); + + ctx->numbytes += sizeof(uint64_t); + percpu_putref(percpu_cprf_ctx); + return ret; } typedef struct cprng_strong cprng_strong_t; void cprng_init(void); +void cprf_init(void); #define CPRNG_INIT_ANY 0x00000001 #define CPRNG_REKEY_ANY 0x00000002