Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:50:05 UTC 2020
Modified Files:
	src/sys/crypto/aes/arch/x86: aes_sse2.h aes_sse2_impl.c
	    aes_sse2_impl.h files.aessse2
Added Files:
	src/sys/crypto/aes/arch/x86: aes_sse2_subr.c

Log Message:
Split SSE2 logic into separate units.

Ensure that there are no paths into files compiled with -msse -msse2
at all except via fpu_kern_enter.

I didn't run into a practical problem with this, but let's not leave a
ticking time bomb for subsequent toolchain changes in case the mere
declaration of local __m128i variables causes trouble.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/x86/aes_sse2.h \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.c \
    src/sys/crypto/aes/arch/x86/aes_sse2_impl.h \
    src/sys/crypto/aes/arch/x86/files.aessse2
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_sse2_subr.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
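[Editor's aside, not part of the commit mail: the discipline the log
message describes means a unit compiled without SSE flags may not so
much as declare a local __m128i; it can only bracket a call into the
-msse -msse2 unit with fpu_kern_enter/fpu_kern_leave.  A minimal sketch
of that shape, paraphrasing the _impl wrappers in the diff below:

	/*
	 * Sketch only: this unit is compiled WITHOUT -msse -msse2, so
	 * it must not mention vector types at all.  It enables the
	 * vector unit, calls into the SSE2-compiled unit, and disables
	 * it again.
	 */
	#include <sys/types.h>

	#include <crypto/aes/aes.h>
	#include <crypto/aes/arch/x86/aes_sse2.h>	/* aes_sse2_enc */

	#include <x86/fpu.h>		/* fpu_kern_enter/fpu_kern_leave */

	static void
	aes_sse2_enc_impl(const struct aesenc *enc,
	    const uint8_t in[static 16], uint8_t out[static 16],
	    uint32_t nrounds)
	{

		fpu_kern_enter();	/* vector unit now usable */
		aes_sse2_enc(enc, in, out, nrounds); /* in aes_sse2_subr.c */
		fpu_kern_leave();	/* vector unit off-limits again */
	}

Note that the prototypes in aes_sse2.h deliberately pass everything via
pointers and integers, never by vector register.]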
Modified files:

Index: src/sys/crypto/aes/arch/x86/aes_sse2.h
diff -u src/sys/crypto/aes/arch/x86/aes_sse2.h:1.1 src/sys/crypto/aes/arch/x86/aes_sse2.h:1.2
--- src/sys/crypto/aes/arch/x86/aes_sse2.h:1.1	Mon Jun 29 23:47:54 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2.h	Mon Jun 29 23:50:05 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_sse2.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+/*	$NetBSD: aes_sse2.h,v 1.2 2020/06/29 23:50:05 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -31,6 +31,31 @@
 
 #include <crypto/aes/aes.h>
 
+/*
+ * These functions MUST NOT use any vector registers for parameters or
+ * results -- the caller is compiled with -mno-sse &c. in the kernel,
+ * and dynamically turns on the vector unit just before calling them.
+ * Internal subroutines that use the vector unit for parameters are
+ * declared in aes_sse2_impl.h instead.
+ */
+
+void aes_sse2_setkey(uint64_t[static 30], const void *, uint32_t);
+
+void aes_sse2_enc(const struct aesenc *, const uint8_t in[static 16],
+    uint8_t[static 16], uint32_t);
+void aes_sse2_dec(const struct aesdec *, const uint8_t in[static 16],
+    uint8_t[static 16], uint32_t);
+void aes_sse2_cbc_enc(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t);
+void aes_sse2_cbc_dec(const struct aesdec *, const uint8_t[static 16],
+    uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t);
+void aes_sse2_xts_enc(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t);
+void aes_sse2_xts_dec(const struct aesdec *, const uint8_t[static 16],
+    uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t);
+
+int aes_sse2_selftest(void);
+
 extern struct aes_impl aes_sse2_impl;
 
 #endif	/* _CRYPTO_AES_ARCH_X86_AES_SSE2_H */

Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.c
diff -u src/sys/crypto/aes/arch/x86/aes_sse2_impl.c:1.1 src/sys/crypto/aes/arch/x86/aes_sse2_impl.c:1.2
--- src/sys/crypto/aes/arch/x86/aes_sse2_impl.c:1.1	Mon Jun 29 23:47:54 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.c	Mon Jun 29 23:50:05 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+/*	$NetBSD: aes_sse2_impl.c,v 1.2 2020/06/29 23:50:05 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,11 +27,10 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.2 2020/06/29 23:50:05 riastradh Exp $");
 
 #include <sys/types.h>
 #include <sys/endian.h>
-#include <sys/systm.h>
 
 #include <crypto/aes/aes.h>
 #include <crypto/aes/arch/x86/aes_sse2.h>
@@ -41,532 +40,99 @@ __KERNEL_RCSID(1, "$NetBSD: aes_sse2_imp
 #include <x86/fpu.h>
 #include <x86/specialreg.h>
 
-#include "aes_sse2_impl.h"
-
 static void
-aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
+aes_sse2_setenckey_impl(struct aesenc *enc, const uint8_t *key,
+    uint32_t nrounds)
 {
-	size_t key_len;
-
-	switch (nrounds) {
-	case 10:
-		key_len = 16;
-		break;
-	case 12:
-		key_len = 24;
-		break;
-	case 14:
-		key_len = 32;
-		break;
-	default:
-		panic("invalid AES nrounds: %u", nrounds);
-	}
 
 	fpu_kern_enter();
-	aes_sse2_keysched(rk, key, key_len);
-	fpu_kern_leave();
-}
-
-static void
-aes_sse2_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
-{
-
 	aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds);
+	fpu_kern_leave();
 }
 
 static void
-aes_sse2_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
+aes_sse2_setdeckey_impl(struct aesdec *dec, const uint8_t *key,
+    uint32_t nrounds)
 {
 
+	fpu_kern_enter();
 	/*
 	 * BearSSL computes InvMixColumns on the fly -- no need for
 	 * distinct decryption round keys.
 	 */
 	aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds);
+	fpu_kern_leave();
 }
 
 static void
-aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
+aes_sse2_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
 
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
-
-	/* Load input block interleaved with garbage blocks. */
-	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
-	q[1] = q[2] = q[3] = _mm_setzero_si128();
-
-	/* Transform to bitslice, decrypt, transform from bitslice. */
-	aes_sse2_ortho(q);
-	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
-	aes_sse2_ortho(q);
-
-	/* Store output block. */
-	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-
+	aes_sse2_enc(enc, in, out, nrounds);
 	fpu_kern_leave();
 }
 
 static void
-aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
+aes_sse2_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
     uint8_t out[static 16], uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
 
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
-
-	/* Load input block interleaved with garbage blocks. */
-	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
-	q[1] = q[2] = q[3] = _mm_setzero_si128();
-
-	/* Transform to bitslice, decrypt, transform from bitslice. */
-	aes_sse2_ortho(q);
-	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
-	aes_sse2_ortho(q);
-
-	/* Store output block. */
-	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-
+	aes_sse2_dec(dec, in, out, nrounds);
 	fpu_kern_leave();
 }
 
 static void
-aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+aes_sse2_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
     uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
-	__m128i cv;
-
-	KASSERT(nbytes % 16 == 0);
-
 	/* Skip if there's nothing to do. */
 	if (nbytes == 0)
 		return;
-
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
-
-	/* Load the IV. */
-	cv = _mm_loadu_epi8(iv);
-
-	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-		/* Load input block and apply CV. */
-		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));
-
-		/* Transform to bitslice, encrypt, transform from bitslice. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Remember ciphertext as CV and store output block. */
-		cv = aes_sse2_interleave_out(q[0]);
-		_mm_storeu_epi8(out, cv);
-	}
-
-	/* Store updated IV. */
-	_mm_storeu_epi8(iv, cv);
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-
+	aes_sse2_cbc_enc(enc, in, out, nbytes, iv, nrounds);
 	fpu_kern_leave();
 }
 
 static void
-aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
-    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
+aes_sse2_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
     uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
-	__m128i cv, iv, w;
-
-	KASSERT(nbytes % 16 == 0);
-
 	/* Skip if there's nothing to do. */
 	if (nbytes == 0)
 		return;
-
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
-
-	/* Load the IV. */
-	iv = _mm_loadu_epi8(ivp);
-
-	/* Load the last cipher block. */
-	cv = _mm_loadu_epi8(in + nbytes - 16);
-
-	/* Store the updated IV. */
-	_mm_storeu_epi8(ivp, cv);
-
-	/* Process the last blocks if not an even multiple of four. */
-	if (nbytes % (4*16)) {
-		unsigned n = (nbytes/16) % 4;
-
-		KASSERT(n > 0);
-		KASSERT(n < 4);
-
-		q[1] = q[2] = q[3] = _mm_setzero_si128();
-		q[n - 1] = aes_sse2_interleave_in(cv);
-		switch (nbytes % 64) {
-		case 48:
-			w = _mm_loadu_epi8(in + nbytes - 32);
-			q[1] = aes_sse2_interleave_in(w);
-			/*FALLTHROUGH*/
-		case 32:
-			w = _mm_loadu_epi8(in + nbytes - 48);
-			q[0] = aes_sse2_interleave_in(w);
-			/*FALLTHROUGH*/
-		case 16:
-			break;
-		}
-
-		/* Decrypt. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		do {
-			n--;
-			w = aes_sse2_interleave_out(q[n]);
-			if ((nbytes -= 16) == 0)
-				goto out;
-			cv = _mm_loadu_epi8(in + nbytes - 16);
-			_mm_storeu_epi8(out + nbytes, w ^ cv);
-		} while (n);
-	}
-
-	for (;;) {
-		KASSERT(nbytes >= 64);
-		nbytes -= 64;
-
-		/*
-		 * 1. Set up upper cipher block from cv.
-		 * 2. Load lower cipher block into cv and set it up.
-		 * 3. Decrypt.
-		 */
-		q[3] = aes_sse2_interleave_in(cv);
-
-		w = _mm_loadu_epi8(in + nbytes + 4*8);
-		q[2] = aes_sse2_interleave_in(w);
-
-		w = _mm_loadu_epi8(in + nbytes + 4*4);
-		q[1] = aes_sse2_interleave_in(w);
-
-		w = _mm_loadu_epi8(in + nbytes + 4*0);
-		q[0] = aes_sse2_interleave_in(w);
-
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Store the upper output block. */
-		w = aes_sse2_interleave_out(q[3]);
-		cv = _mm_loadu_epi8(in + nbytes + 4*8);
-		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);
-
-		/* Store the middle output blocks. */
-		w = aes_sse2_interleave_out(q[2]);
-		cv = _mm_loadu_epi8(in + nbytes + 4*4);
-		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);
-
-		w = aes_sse2_interleave_out(q[1]);
-		cv = _mm_loadu_epi8(in + nbytes + 4*0);
-		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);
-
-		/*
-		 * Get the first output block, but don't load the CV
-		 * yet -- it might be the previous ciphertext block, or
-		 * it might be the IV.
-		 */
-		w = aes_sse2_interleave_out(q[0]);
-
-		/* Stop if we've reached the first output block. */
-		if (nbytes == 0)
-			goto out;
-
-		/*
-		 * Load the preceding cipher block, and apply it as the
-		 * chaining value to this one.
-		 */
-		cv = _mm_loadu_epi8(in + nbytes - 16);
-		_mm_storeu_epi8(out + nbytes, w ^ cv);
-	}
-
-out:	/* Store the first output block. */
-	_mm_storeu_epi8(out, w ^ iv);
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-
+	aes_sse2_cbc_dec(dec, in, out, nbytes, iv, nrounds);
 	fpu_kern_leave();
 }
 
-static inline __m128i
-aes_sse2_xts_update(__m128i t)
-{
-	const __m128i one = _mm_set_epi64x(1, 1);
-	__m128i s, m, c;
-
-	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
-	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
-	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
-	c = _mm_set_epi64x(1, 0x87);	/* carry */
-
-	return _mm_slli_epi64(t, 1) ^ (c & ~m);
-}
-
-static int
-aes_sse2_xts_update_selftest(void)
-{
-	static const struct {
-		uint32_t in[4], out[4];
-	} cases[] = {
-		[0] = { {1}, {2} },
-		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
-		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
-		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
-		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
-		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
-	};
-	unsigned i;
-	uint32_t t[4];
-	int result = 0;
-
-	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
-		t[0] = cases[i].in[0];
-		t[1] = cases[i].in[1];
-		t[2] = cases[i].in[2];
-		t[3] = cases[i].in[3];
-		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
-		if (t[0] != cases[i].out[0] ||
-		    t[1] != cases[i].out[1] ||
-		    t[2] != cases[i].out[2] ||
-		    t[3] != cases[i].out[3]) {
-			printf("%s %u:"
-			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
-			    __func__, i, t[0], t[1], t[2], t[3]);
-			result = -1;
-		}
-	}
-
-	return result;
-}
-
 static void
-aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+aes_sse2_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
     uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
-	__m128i w;
-	__m128i t[5];
-	unsigned i;
-
-	KASSERT(nbytes % 16 == 0);
-
 	/* Skip if there's nothing to do. */
 	if (nbytes == 0)
 		return;
-
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
-
-	/* Load tweak. */
-	t[0] = _mm_loadu_epi8(tweak);
-
-	/* Handle the first block separately if odd number. */
-	if (nbytes % (4*16)) {
-		/* Load up the tweaked inputs. */
-		for (i = 0; i < (nbytes/16) % 4; i++) {
-			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
-			q[i] = aes_sse2_interleave_in(w);
-			t[i + 1] = aes_sse2_xts_update(t[i]);
-		}
-		for (; i < 4; i++)
-			q[i] = _mm_setzero_si128();
-
-		/* Encrypt up to four blocks. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Store the tweaked outputs. */
-		for (i = 0; i < (nbytes/16) % 4; i++) {
-			w = aes_sse2_interleave_out(q[i]);
-			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
-		}
-
-		/* Advance to the next block. */
-		t[0] = t[i];
-		in += nbytes % (4*16);
-		out += nbytes % (4*16);
-		nbytes -= nbytes % (4*16);
-		if (nbytes == 0)
-			goto out;
-	}
-
-	do {
-		KASSERT(nbytes % 64 == 0);
-		KASSERT(nbytes >= 64);
-
-		/* Load up the tweaked inputs. */
-		for (i = 0; i < 4; i++) {
-			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
-			q[i] = aes_sse2_interleave_in(w);
-			t[i + 1] = aes_sse2_xts_update(t[i]);
-		}
-
-		/* Encrypt four blocks. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Store the tweaked outputs. */
-		for (i = 0; i < 4; i++) {
-			w = aes_sse2_interleave_out(q[i]);
-			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
-		}
-
-		/* Advance to the next block. */
-		t[0] = t[4];
-		in += 64;
-		out += 64;
-		nbytes -= 64;
-	} while (nbytes);
-
-out:	/* Store the updated tweak. */
-	_mm_storeu_epi8(tweak, t[0]);
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-	explicit_memset(t, 0, sizeof t);
-
+	aes_sse2_xts_enc(enc, in, out, nbytes, tweak, nrounds);
 	fpu_kern_leave();
 }
 
 static void
-aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+aes_sse2_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
     uint32_t nrounds)
 {
-	uint64_t sk_exp[120];
-	__m128i q[4];
-	__m128i w;
-	__m128i t[5];
-	unsigned i;
-
-	KASSERT(nbytes % 16 == 0);
-
 	/* Skip if there's nothing to do. */
 	if (nbytes == 0)
 		return;
-
 	fpu_kern_enter();
-
-	/* Expand round keys for bitslicing. */
-	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
-
-	/* Load tweak. */
-	t[0] = _mm_loadu_epi8(tweak);
-
-	/* Handle the first block separately if odd number. */
-	if (nbytes % (4*16)) {
-		/* Load up the tweaked inputs. */
-		for (i = 0; i < (nbytes/16) % 4; i++) {
-			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
-			q[i] = aes_sse2_interleave_in(w);
-			t[i + 1] = aes_sse2_xts_update(t[i]);
-		}
-		for (; i < 4; i++)
-			q[i] = _mm_setzero_si128();
-
-		/* Decrypt up to four blocks. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Store the tweaked outputs. */
-		for (i = 0; i < (nbytes/16) % 4; i++) {
-			w = aes_sse2_interleave_out(q[i]);
-			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
-		}
-
-		/* Advance to the next block. */
-		t[0] = t[i];
-		in += nbytes % (4*16);
-		out += nbytes % (4*16);
-		nbytes -= nbytes % (4*16);
-		if (nbytes == 0)
-			goto out;
-	}
-
-	do {
-		KASSERT(nbytes % 64 == 0);
-		KASSERT(nbytes >= 64);
-
-		/* Load up the tweaked inputs. */
-		for (i = 0; i < 4; i++) {
-			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
-			q[i] = aes_sse2_interleave_in(w);
-			t[i + 1] = aes_sse2_xts_update(t[i]);
-		}
-
-		/* Decrypt four blocks. */
-		aes_sse2_ortho(q);
-		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
-		aes_sse2_ortho(q);
-
-		/* Store the tweaked outputs. */
-		for (i = 0; i < 4; i++) {
-			w = aes_sse2_interleave_out(q[i]);
-			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
-		}
-
-		/* Advance to the next block. */
-		t[0] = t[4];
-		in += 64;
-		out += 64;
-		nbytes -= 64;
-	} while (nbytes);
-
-out:	/* Store the updated tweak. */
-	_mm_storeu_epi8(tweak, t[0]);
-
-	/* Paranoia: Zero temporary buffers. */
-	explicit_memset(sk_exp, 0, sizeof sk_exp);
-	explicit_memset(q, 0, sizeof q);
-	explicit_memset(t, 0, sizeof t);
-
+	aes_sse2_xts_dec(dec, in, out, nbytes, tweak, nrounds);
 	fpu_kern_leave();
 }
 
@@ -582,30 +148,21 @@ aes_sse2_probe(void)
 		return -1;
 
 	fpu_kern_enter();
-
-	if (aes_sse2_xts_update_selftest())
-		result = -1;
-
+	result = aes_sse2_selftest();
 	fpu_kern_leave();
 
-	/* XXX test aes_sse2_bitslice_decrypt */
-	/* XXX test aes_sse2_bitslice_encrypt */
-	/* XXX test aes_sse2_keysched */
-	/* XXX test aes_sse2_ortho */
-	/* XXX test aes_sse2_skey_expand */
-
 	return result;
 }
 
 struct aes_impl aes_sse2_impl = {
 	.ai_name = "Intel SSE2 bitsliced",
 	.ai_probe = aes_sse2_probe,
-	.ai_setenckey = aes_sse2_setenckey,
-	.ai_setdeckey = aes_sse2_setdeckey,
-	.ai_enc = aes_sse2_enc,
-	.ai_dec = aes_sse2_dec,
-	.ai_cbc_enc = aes_sse2_cbc_enc,
-	.ai_cbc_dec = aes_sse2_cbc_dec,
-	.ai_xts_enc = aes_sse2_xts_enc,
-	.ai_xts_dec = aes_sse2_xts_dec,
+	.ai_setenckey = aes_sse2_setenckey_impl,
+	.ai_setdeckey = aes_sse2_setdeckey_impl,
+	.ai_enc = aes_sse2_enc_impl,
+	.ai_dec = aes_sse2_dec_impl,
+	.ai_cbc_enc = aes_sse2_cbc_enc_impl,
+	.ai_cbc_dec = aes_sse2_cbc_dec_impl,
+	.ai_xts_enc = aes_sse2_xts_enc_impl,
+	.ai_xts_dec = aes_sse2_xts_dec_impl,
 };

Index: src/sys/crypto/aes/arch/x86/aes_sse2_impl.h
diff -u src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.1 src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.2
--- src/sys/crypto/aes/arch/x86/aes_sse2_impl.h:1.1	Mon Jun 29 23:47:54 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_impl.h	Mon Jun 29 23:50:05 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_sse2_impl.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $	*/
+/*	$NetBSD: aes_sse2_impl.h,v 1.2 2020/06/29 23:50:05 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -31,6 +31,8 @@
 
 #include <sys/types.h>
 
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/x86/aes_sse2.h>
 #include <crypto/aes/arch/x86/immintrin.h>
 #include <crypto/aes/arch/x86/immintrin_ext.h>
 

Index: src/sys/crypto/aes/arch/x86/files.aessse2
diff -u src/sys/crypto/aes/arch/x86/files.aessse2:1.1 src/sys/crypto/aes/arch/x86/files.aessse2:1.2
--- src/sys/crypto/aes/arch/x86/files.aessse2:1.1	Mon Jun 29 23:47:54 2020
+++ src/sys/crypto/aes/arch/x86/files.aessse2	Mon Jun 29 23:50:05 2020
@@ -1,11 +1,12 @@
-#	$NetBSD: files.aessse2,v 1.1 2020/06/29 23:47:54 riastradh Exp $
+#	$NetBSD: files.aessse2,v 1.2 2020/06/29 23:50:05 riastradh Exp $
 
-makeoptions	aes	"COPTS.aes_sse2.c"+="-msse2"
-makeoptions	aes	"COPTS.aes_sse2_dec.c"+="-msse2"
-makeoptions	aes	"COPTS.aes_sse2_enc.c"+="-msse2"
-makeoptions	aes	"COPTS.aes_sse2_impl.c"+="-msse2"
+makeoptions	aes	"COPTS.aes_sse2.c"+="-msse -msse2"
+makeoptions	aes	"COPTS.aes_sse2_dec.c"+="-msse -msse2"
+makeoptions	aes	"COPTS.aes_sse2_enc.c"+="-msse -msse2"
+makeoptions	aes	"COPTS.aes_sse2_subr.c"+="-msse -msse2"
 
 file	crypto/aes/arch/x86/aes_sse2.c		aes
 file	crypto/aes/arch/x86/aes_sse2_dec.c	aes
 file	crypto/aes/arch/x86/aes_sse2_enc.c	aes
 file	crypto/aes/arch/x86/aes_sse2_impl.c	aes
+file	crypto/aes/arch/x86/aes_sse2_subr.c	aes

Added files:

Index: src/sys/crypto/aes/arch/x86/aes_sse2_subr.c
diff -u /dev/null src/sys/crypto/aes/arch/x86/aes_sse2_subr.c:1.1
--- /dev/null	Mon Jun 29 23:50:05 2020
+++ src/sys/crypto/aes/arch/x86/aes_sse2_subr.c	Mon Jun 29 23:50:05 2020
@@ -0,0 +1,526 @@
+/*	$NetBSD: aes_sse2_subr.c,v 1.1 2020/06/29 23:50:05 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.1 2020/06/29 23:50:05 riastradh Exp $");
+
+#include <sys/systm.h>
+
+#include <lib/libkern/libkern.h>
+
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/x86/aes_sse2.h>
+
+#include "aes_sse2_impl.h"
+
+void
+aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
+{
+	size_t key_len;
+
+	switch (nrounds) {
+	case 10:
+		key_len = 16;
+		break;
+	case 12:
+		key_len = 24;
+		break;
+	case 14:
+		key_len = 32;
+		break;
+	default:
+		panic("invalid AES nrounds: %u", nrounds);
+	}
+
+	aes_sse2_keysched(rk, key, key_len);
+}
+
+void
+aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load input block interleaved with garbage blocks. */
+	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
+	q[1] = q[2] = q[3] = _mm_setzero_si128();
+
+	/* Transform to bitslice, encrypt, transform from bitslice. */
+	aes_sse2_ortho(q);
+	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+	aes_sse2_ortho(q);
+
+	/* Store output block. */
+	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load input block interleaved with garbage blocks. */
+	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
+	q[1] = q[2] = q[3] = _mm_setzero_si128();
+
+	/* Transform to bitslice, decrypt, transform from bitslice. */
+	aes_sse2_ortho(q);
+	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+	aes_sse2_ortho(q);
+
+	/* Store output block. */
+	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i cv;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load the IV. */
+	cv = _mm_loadu_epi8(iv);
+
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		/* Load input block and apply CV. */
+		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));
+
+		/* Transform to bitslice, encrypt, transform from bitslice. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Remember ciphertext as CV and store output block. */
+		cv = aes_sse2_interleave_out(q[0]);
+		_mm_storeu_epi8(out, cv);
+	}
+
+	/* Store updated IV. */
+	_mm_storeu_epi8(iv, cv);
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i cv, iv, w;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load the IV. */
+	iv = _mm_loadu_epi8(ivp);
+
+	/* Load the last cipher block. */
+	cv = _mm_loadu_epi8(in + nbytes - 16);
+
+	/* Store the updated IV. */
+	_mm_storeu_epi8(ivp, cv);
+
+	/* Process the last blocks if not an even multiple of four. */
+	if (nbytes % (4*16)) {
+		unsigned n = (nbytes/16) % 4;
+
+		KASSERT(n > 0);
+		KASSERT(n < 4);
+
+		q[1] = q[2] = q[3] = _mm_setzero_si128();
+		q[n - 1] = aes_sse2_interleave_in(cv);
+		switch (nbytes % 64) {
+		case 48:
+			w = _mm_loadu_epi8(in + nbytes - 32);
+			q[1] = aes_sse2_interleave_in(w);
+			/*FALLTHROUGH*/
+		case 32:
+			w = _mm_loadu_epi8(in + nbytes - 48);
+			q[0] = aes_sse2_interleave_in(w);
+			/*FALLTHROUGH*/
+		case 16:
+			break;
+		}
+
+		/* Decrypt. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		do {
+			n--;
+			w = aes_sse2_interleave_out(q[n]);
+			if ((nbytes -= 16) == 0)
+				goto out;
+			cv = _mm_loadu_epi8(in + nbytes - 16);
+			_mm_storeu_epi8(out + nbytes, w ^ cv);
+		} while (n);
+	}
+
+	for (;;) {
+		KASSERT(nbytes >= 64);
+		nbytes -= 64;
+
+		/*
+		 * 1. Set up upper cipher block from cv.
+		 * 2. Load lower cipher block into cv and set it up.
+		 * 3. Decrypt.
+		 */
+		q[3] = aes_sse2_interleave_in(cv);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*8);
+		q[2] = aes_sse2_interleave_in(w);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*4);
+		q[1] = aes_sse2_interleave_in(w);
+
+		w = _mm_loadu_epi8(in + nbytes + 4*0);
+		q[0] = aes_sse2_interleave_in(w);
+
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the upper output block. */
+		w = aes_sse2_interleave_out(q[3]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*8);
+		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);
+
+		/* Store the middle output blocks. */
+		w = aes_sse2_interleave_out(q[2]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*4);
+		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);
+
+		w = aes_sse2_interleave_out(q[1]);
+		cv = _mm_loadu_epi8(in + nbytes + 4*0);
+		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);
+
+		/*
+		 * Get the first output block, but don't load the CV
+		 * yet -- it might be the previous ciphertext block, or
+		 * it might be the IV.
+		 */
+		w = aes_sse2_interleave_out(q[0]);
+
+		/* Stop if we've reached the first output block. */
+		if (nbytes == 0)
+			goto out;
+
+		/*
+		 * Load the preceding cipher block, and apply it as the
+		 * chaining value to this one.
+		 */
+		cv = _mm_loadu_epi8(in + nbytes - 16);
+		_mm_storeu_epi8(out + nbytes, w ^ cv);
+	}
+
+out:	/* Store the first output block. */
+	_mm_storeu_epi8(out, w ^ iv);
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+}
+
+static inline __m128i
+aes_sse2_xts_update(__m128i t)
+{
+	const __m128i one = _mm_set_epi64x(1, 1);
+	__m128i s, m, c;
+
+	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
+	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
+	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
+	c = _mm_set_epi64x(1, 0x87);	/* carry */
+
+	return _mm_slli_epi64(t, 1) ^ (c & ~m);
+}
+
+static int
+aes_sse2_xts_update_selftest(void)
+{
+	static const struct {
+		uint32_t in[4], out[4];
+	} cases[] = {
+		[0] = { {1}, {2} },
+		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
+		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
+		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
+		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
+		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+	};
+	unsigned i;
+	uint32_t t[4];
+	int result = 0;
+
+	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
+		t[0] = cases[i].in[0];
+		t[1] = cases[i].in[1];
+		t[2] = cases[i].in[2];
+		t[3] = cases[i].in[3];
+		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
+		if (t[0] != cases[i].out[0] ||
+		    t[1] != cases[i].out[1] ||
+		    t[2] != cases[i].out[2] ||
+		    t[3] != cases[i].out[3]) {
+			printf("%s %u:"
+			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
+			    __func__, i, t[0], t[1], t[2], t[3]);
+			result = -1;
+		}
+	}
+
+	return result;
+}
+
+void
+aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i w;
+	__m128i t[5];
+	unsigned i;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);
+
+	/* Load tweak. */
+	t[0] = _mm_loadu_epi8(tweak);
+
+	/* Handle the first block separately if odd number. */
+	if (nbytes % (4*16)) {
+		/* Load up the tweaked inputs. */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+		for (; i < 4; i++)
+			q[i] = _mm_setzero_si128();
+
+		/* Encrypt up to four blocks. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs. */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block. */
+		t[0] = t[i];
+		in += nbytes % (4*16);
+		out += nbytes % (4*16);
+		nbytes -= nbytes % (4*16);
+		if (nbytes == 0)
+			goto out;
+	}
+
+	do {
+		KASSERT(nbytes % 64 == 0);
+		KASSERT(nbytes >= 64);
+
+		/* Load up the tweaked inputs. */
+		for (i = 0; i < 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+
+		/* Encrypt four blocks. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs. */
+		for (i = 0; i < 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block. */
+		t[0] = t[4];
+		in += 64;
+		out += 64;
+		nbytes -= 64;
+	} while (nbytes);
+
+out:	/* Store the updated tweak. */
+	_mm_storeu_epi8(tweak, t[0]);
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+	explicit_memset(t, 0, sizeof t);
+}
+
+void
+aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+	uint64_t sk_exp[120];
+	__m128i q[4];
+	__m128i w;
+	__m128i t[5];
+	unsigned i;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	/* Expand round keys for bitslicing. */
+	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);
+
+	/* Load tweak. */
+	t[0] = _mm_loadu_epi8(tweak);
+
+	/* Handle the first block separately if odd number. */
+	if (nbytes % (4*16)) {
+		/* Load up the tweaked inputs. */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+		for (; i < 4; i++)
+			q[i] = _mm_setzero_si128();
+
+		/* Decrypt up to four blocks. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs. */
+		for (i = 0; i < (nbytes/16) % 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block. */
+		t[0] = t[i];
+		in += nbytes % (4*16);
+		out += nbytes % (4*16);
+		nbytes -= nbytes % (4*16);
+		if (nbytes == 0)
+			goto out;
+	}
+
+	do {
+		KASSERT(nbytes % 64 == 0);
+		KASSERT(nbytes >= 64);
+
+		/* Load up the tweaked inputs. */
+		for (i = 0; i < 4; i++) {
+			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
+			q[i] = aes_sse2_interleave_in(w);
+			t[i + 1] = aes_sse2_xts_update(t[i]);
+		}
+
+		/* Decrypt four blocks. */
+		aes_sse2_ortho(q);
+		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
+		aes_sse2_ortho(q);
+
+		/* Store the tweaked outputs. */
+		for (i = 0; i < 4; i++) {
+			w = aes_sse2_interleave_out(q[i]);
+			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
+		}
+
+		/* Advance to the next block. */
+		t[0] = t[4];
+		in += 64;
+		out += 64;
+		nbytes -= 64;
+	} while (nbytes);
+
+out:	/* Store the updated tweak. */
+	_mm_storeu_epi8(tweak, t[0]);
+
+	/* Paranoia: Zero temporary buffers. */
+	explicit_memset(sk_exp, 0, sizeof sk_exp);
+	explicit_memset(q, 0, sizeof q);
+	explicit_memset(t, 0, sizeof t);
+}
+
+int
+aes_sse2_selftest(void)
+{
+
+	if (aes_sse2_xts_update_selftest())
+		return -1;
+
+	/* XXX test aes_sse2_bitslice_decrypt */
+	/* XXX test aes_sse2_bitslice_encrypt */
+	/* XXX test aes_sse2_keysched */
+	/* XXX test aes_sse2_ortho */
+	/* XXX test aes_sse2_skey_expand */
+
+	return 0;
+}
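
[Editor's note on the tweak arithmetic above, not part of the commit:
aes_sse2_xts_update multiplies the 128-bit XTS tweak by x in GF(2^128)
modulo x^128 + x^7 + x^2 + x + 1.  A portable-C sketch of the same
function, assuming the tweak is held little-endian in two 64-bit
halves with t[0] the low half (illustration only):

	#include <stdint.h>

	static void
	xts_update_u64(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;	/* bit 127, about to fall off */

		/* Shift the whole 128-bit value left by one bit... */
		t[1] = (t[1] << 1) | (t[0] >> 63);
		/* ...and fold the dropped bit back in as x^7+x^2+x+1. */
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}

This reproduces the selftest vectors, e.g. an input with only bit 127
set comes out as 0x87.  The SSE2 version is the branchless equivalent:
_mm_slli_epi64 shifts each 64-bit lane independently, and the
swapped-halves mask applies 0x87 to the low lane when bit 127 was set
and carries bit 63 into the high lane.]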