Module Name:	src
Committed By:	riastradh
Date:		Mon Jun 29 23:41:35 UTC 2020
Modified Files:
	src/sys/crypto/aes/arch/x86: aes_via.c

Log Message:
VIA AES: Batch AES-XTS computation into eight blocks at a time.

Experimental -- performance improvement is not clearly worth the
complexity.

To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/x86/aes_via.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:

Index: src/sys/crypto/aes/arch/x86/aes_via.c
diff -u src/sys/crypto/aes/arch/x86/aes_via.c:1.1 src/sys/crypto/aes/arch/x86/aes_via.c:1.2
--- src/sys/crypto/aes/arch/x86/aes_via.c:1.1	Mon Jun 29 23:39:30 2020
+++ src/sys/crypto/aes/arch/x86/aes_via.c	Mon Jun 29 23:41:35 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_via.c,v 1.1 2020/06/29 23:39:30 riastradh Exp $	*/
+/*	$NetBSD: aes_via.c,v 1.2 2020/06/29 23:41:35 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.1 2020/06/29 23:39:30 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.2 2020/06/29 23:41:35 riastradh Exp $");
 
 #include <sys/types.h>
 #include <sys/evcnt.h>
@@ -119,8 +119,8 @@ aesvia_setdeckey(struct aesdec *dec, con
 }
 
 static inline void
-aesvia_enc1(const struct aesenc *enc, const uint8_t in[static 16],
-    uint8_t out[static 16], uint32_t cw0)
+aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
 {
 	const uint32_t cw[4] __aligned(16) = {
 		[0] = (cw0
@@ -128,7 +128,6 @@ aesvia_enc1(const struct aesenc *enc, co
 		    | C3_CRYPT_CWLO_ENCRYPT
 		    | C3_CRYPT_CWLO_NORMAL),
 	};
-	size_t nblocks = 1;
 
 	KASSERT(((uintptr_t)enc & 0xf) == 0);
 	KASSERT(((uintptr_t)in & 0xf) == 0);
@@ -141,8 +140,8 @@ aesvia_enc1(const struct aesenc *enc, co
 }
 
 static inline void
-aesvia_dec1(const struct aesdec *dec, const uint8_t in[static 16],
-    uint8_t out[static 16], uint32_t cw0)
+aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
 {
 	const uint32_t cw[4] __aligned(16) = {
 		[0] = (cw0
@@ -150,7 +149,6 @@ aesvia_dec1(const struct aesdec *dec, co
 		    | C3_CRYPT_CWLO_DECRYPT
 		    | C3_CRYPT_CWLO_NORMAL),
 	};
-	size_t nblocks = 1;
 
 	KASSERT(((uintptr_t)dec & 0xf) == 0);
 	KASSERT(((uintptr_t)in & 0xf) == 0);
@@ -180,7 +178,7 @@ aesvia_enc(const struct aesenc *enc, con
 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
 	    ((uintptr_t)in & 0xff0) != 0xff0) {
 		enc_aligned_evcnt.ev_count++;
-		aesvia_enc1(enc, in, out, cw0);
+		aesvia_encN(enc, in, out, 1, cw0);
 	} else {
 		enc_unaligned_evcnt.ev_count++;
 		/*
@@ -194,7 +192,7 @@ aesvia_enc(const struct aesenc *enc, con
 		uint8_t outbuf[16] __aligned(16);
 
 		memcpy(inbuf, in, 16);
-		aesvia_enc1(enc, inbuf, outbuf, cw0);
+		aesvia_encN(enc, inbuf, outbuf, 1, cw0);
 		memcpy(out, outbuf, 16);
 
 		explicit_memset(inbuf, 0, sizeof inbuf);
@@ -221,7 +219,7 @@ aesvia_dec(const struct aesdec *dec, con
 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
 	    ((uintptr_t)in & 0xff0) != 0xff0) {
 		dec_aligned_evcnt.ev_count++;
-		aesvia_dec1(dec, in, out, cw0);
+		aesvia_decN(dec, in, out, 1, cw0);
 	} else {
 		dec_unaligned_evcnt.ev_count++;
 		/*
@@ -235,7 +233,7 @@ aesvia_dec(const struct aesdec *dec, con
 		uint8_t outbuf[16] __aligned(16);
 
 		memcpy(inbuf, in, 16);
-		aesvia_dec1(dec, inbuf, outbuf, cw0);
+		aesvia_decN(dec, inbuf, outbuf, 1, cw0);
 		memcpy(out, outbuf, 16);
 
 		explicit_memset(inbuf, 0, sizeof inbuf);
@@ -245,7 +243,7 @@ aesvia_dec(const struct aesdec *dec, con
 }
 
 static inline void
-aesvia_cbc_enc1(const struct aesenc *enc, const uint8_t in[static 16],
+aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
 {
 	const uint32_t cw[4] __aligned(16) = {
@@ -274,7 +272,7 @@ aesvia_cbc_enc1(const struct aesenc *enc
 }
 
 static inline void
-aesvia_cbc_dec1(const struct aesdec *dec, const uint8_t in[static 16],
+aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
     uint32_t cw0)
 {
@@ -340,7 +338,7 @@ aesvia_cbc_enc(const struct aesenc *enc,
 	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
 		cbcenc_aligned_evcnt.ev_count++;
 		uint8_t *ivp = iv;
-		aesvia_cbc_enc1(enc, in, out, nbytes/16, &ivp, cw0);
+		aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
 		memcpy(iv, ivp, 16);
 	} else {
 		cbcenc_unaligned_evcnt.ev_count++;
@@ -351,7 +349,7 @@ aesvia_cbc_enc(const struct aesenc *enc,
 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 			memcpy(tmp, in, 16);
 			xor128(tmp, tmp, cv);
-			aesvia_enc1(enc, tmp, cv, cw0);
+			aesvia_encN(enc, tmp, cv, 1, cw0);
 			memcpy(out, cv, 16);
 		}
 		memcpy(iv, cv, 16);
@@ -381,7 +379,7 @@ aesvia_cbc_dec(const struct aesdec *dec,
 	aesvia_reload_keys();
 	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
 		cbcdec_aligned_evcnt.ev_count++;
-		aesvia_cbc_dec1(dec, in, out, nbytes/16, iv, cw0);
+		aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
 	} else {
 		cbcdec_unaligned_evcnt.ev_count++;
 		uint8_t iv0[16] __aligned(16);
@@ -393,7 +391,7 @@ aesvia_cbc_dec(const struct aesdec *dec,
 		memcpy(iv, cv, 16);
 
 		for (;;) {
-			aesvia_dec1(dec, cv, tmp, cw0);
+			aesvia_decN(dec, cv, tmp, 1, cw0);
 			if ((nbytes -= 16) == 0)
 				break;
 			memcpy(cv, in + nbytes - 16, 16);
@@ -480,6 +478,7 @@ aesvia_xts_enc(const struct aesenc *enc,
 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
 		xtsenc_aligned_evcnt.ev_count++;
 		unsigned lastblock = 0;
+		uint32_t buf[8*4] __aligned(16);
 
 		/*
 		 * Make sure the last block is not the last block of a
@@ -491,20 +490,43 @@ aesvia_xts_enc(const struct aesenc *enc,
 		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
 		nbytes -= lastblock;
 
-		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-			xor128(out, in, t);
-			aesvia_enc1(enc, out, out, cw0);
-			xor128(out, out, t);
-			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+		/*
+		 * Handle an odd number of initial blocks so we can
+		 * process the rest in eight-block (128-byte) chunks.
+		 */
+		if (nbytes % 128) {
+			unsigned nbytes128 = nbytes % 128;
+
+			nbytes -= nbytes128;
+			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
+			{
+				xor128(out, in, t);
+				aesvia_encN(enc, out, out, 1, cw0);
+				xor128(out, out, t);
+				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+			}
+		}
+
+		/* Process eight blocks at a time.  */
+		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
+			unsigned i;
+			for (i = 0; i < 8; i++) {
+				memcpy(buf + 4*i, t, 16);
+				xor128(out + 4*i, in + 4*i, t);
+				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+			}
+			aesvia_encN(enc, out, out, 8, cw0);
+			for (i = 0; i < 8; i++)
+				xor128(out + 4*i, in + 4*i, buf + 4*i);
 		}
 
 		/* Handle the last block of a page, if necessary.  */
 		if (lastblock) {
-			uint8_t buf[16] __aligned(16);
 			xor128(buf, in, t);
-			aesvia_enc1(enc, buf, out, cw0);
-			explicit_memset(buf, 0, sizeof buf);
+			aesvia_encN(enc, (const void *)buf, out, 1, cw0);
 		}
+
+		explicit_memset(buf, 0, sizeof buf);
 	} else {
 		xtsenc_unaligned_evcnt.ev_count++;
 		uint8_t buf[16] __aligned(16);
@@ -512,7 +534,7 @@ aesvia_xts_enc(const struct aesenc *enc,
 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 			memcpy(buf, in, 16);
 			xor128(buf, buf, t);
-			aesvia_enc1(enc, buf, buf, cw0);
+			aesvia_encN(enc, buf, buf, 1, cw0);
 			xor128(buf, buf, t);
 			memcpy(out, buf, 16);
 			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
@@ -550,6 +572,7 @@ aesvia_xts_dec(const struct aesdec *dec,
 	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
 		xtsdec_aligned_evcnt.ev_count++;
 		unsigned lastblock = 0;
+		uint32_t buf[8*4] __aligned(16);
 
 		/*
 		 * Make sure the last block is not the last block of a
@@ -561,20 +584,43 @@ aesvia_xts_dec(const struct aesdec *dec,
 		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
 		nbytes -= lastblock;
 
-		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-			xor128(out, in, t);
-			aesvia_dec1(dec, out, out, cw0);
-			xor128(out, out, t);
-			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+		/*
+		 * Handle an odd number of initial blocks so we can
+		 * process the rest in eight-block (128-byte) chunks.
+		 */
+		if (nbytes % 128) {
+			unsigned nbytes128 = nbytes % 128;
+
+			nbytes -= nbytes128;
+			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
+			{
+				xor128(out, in, t);
+				aesvia_decN(dec, out, out, 1, cw0);
+				xor128(out, out, t);
+				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+			}
+		}
+
+		/* Process eight blocks at a time.  */
+		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
+			unsigned i;
+			for (i = 0; i < 8; i++) {
+				memcpy(buf + 4*i, t, 16);
+				xor128(out + 4*i, in + 4*i, t);
+				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
+			}
+			aesvia_decN(dec, out, out, 8, cw0);
+			for (i = 0; i < 8; i++)
+				xor128(out + 4*i, in + 4*i, buf + 4*i);
 		}
 
 		/* Handle the last block of a page, if necessary.  */
 		if (lastblock) {
-			uint8_t buf[16] __aligned(16);
 			xor128(buf, in, t);
-			aesvia_dec1(dec, buf, out, cw0);
-			explicit_memset(buf, 0, sizeof buf);
+			aesvia_decN(dec, (const void *)buf, out, 1, cw0);
 		}
+
+		explicit_memset(buf, 0, sizeof buf);
 	} else {
 		xtsdec_unaligned_evcnt.ev_count++;
 		uint8_t buf[16] __aligned(16);
@@ -582,7 +628,7 @@ aesvia_xts_dec(const struct aesdec *dec,
 		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 			memcpy(buf, in, 16);
 			xor128(buf, buf, t);
-			aesvia_dec1(dec, buf, buf, cw0);
+			aesvia_decN(dec, buf, buf, 1, cw0);
 			xor128(buf, buf, t);
 			memcpy(out, buf, 16);
 			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
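
For readers who want the shape of the change without wading through the
hunks: the batched XTS path saves the next eight tweaks, pre-whitens
eight blocks, runs one multi-block ECB call, and post-whitens with the
saved tweaks.  The standalone C sketch below illustrates that pattern
under stated assumptions; it is an illustration, not the committed
code.  aes_ecb_encN() is a hypothetical stand-in for the VIA xcrypt
ECB primitive (aesvia_encN in the diff), and xts_update() is assumed
to match aesvia_xts_update(), i.e. multiplication of the little-endian
tweak by x modulo the XTS polynomial x^128 + x^7 + x^2 + x + 1.  The
odd initial blocks and the last-block-of-a-page dance in the driver
are omitted for clarity.

	#include <stdint.h>
	#include <string.h>

	/*
	 * Hypothetical batched AES-ECB primitive, standing in for the
	 * VIA xcrypt instruction: encrypts nblocks consecutive 16-byte
	 * blocks in a single invocation.  Definition not shown.
	 */
	void aes_ecb_encN(const void *key, const uint8_t *in,
	    uint8_t *out, size_t nblocks);

	/* XOR two 16-byte blocks into x.  */
	static void
	xor128(uint8_t *x, const uint8_t *a, const uint8_t *b)
	{
		unsigned i;

		for (i = 0; i < 16; i++)
			x[i] = a[i] ^ b[i];
	}

	/* Multiply the little-endian 128-bit tweak by x in GF(2^128).  */
	static void
	xts_update(uint8_t t[16])
	{
		unsigned i, carry = t[15] >> 7;

		for (i = 15; i > 0; i--)
			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); /* fold x^128 in */
	}

	/*
	 * Encrypt nbytes (a multiple of 128) under XTS, eight blocks
	 * per hardware call, starting from tweak t.
	 */
	static void
	xts_enc8(const void *key, const uint8_t *in, uint8_t *out,
	    size_t nbytes, uint8_t t[16])
	{
		uint8_t tweaks[8][16];	/* saved for post-whitening */
		unsigned i;

		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			for (i = 0; i < 8; i++) {
				memcpy(tweaks[i], t, 16);  /* save tweak */
				xor128(out + 16*i, in + 16*i, t); /* pre-whiten */
				xts_update(t);		   /* advance tweak */
			}
			aes_ecb_encN(key, out, out, 8);	/* one call, 8 blocks */
			for (i = 0; i < 8; i++)		/* post-whiten */
				xor128(out + 16*i, out + 16*i, tweaks[i]);
		}
	}

Since the xcrypt instruction takes a block count, doing eight blocks
per invocation trades a 128-byte tweak buffer for roughly one eighth
the number of hardware calls -- the amortization whose payoff the log
message above describes as experimental.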