Module Name:	src
Committed By:	riastradh
Date:		Tue Jul 28 20:11:09 UTC 2020
Modified Files:
	src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon_impl.h
	    aes_neon_subr.c arm_neon.h

Log Message:
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4, whose CPU lacks
ARMv8.0-AES.


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c \
    src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
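[Editor's note] CBC decryption parallelizes because each plaintext block
depends only on ciphertext already in memory: P[i] = D(C[i]) ^ C[i-1], with
C[-1] = IV.  CBC encryption, by contrast, is a serial chain and cannot use
the 2x path.  The sketch below is an editor's illustration, not part of the
commit: it shows the two-blocks-per-iteration data flow with a stand-in
"cipher" (a plain XOR with the key, not AES).  The committed
aes_neon_cbc_dec walks the buffer from the end and peels off an odd trailing
block first, but the dependency structure is the same.

#include <stddef.h>
#include <stdint.h>

/*
 * Stand-in 16-byte "cipher" used only to show the data flow.  It is a
 * plain XOR with the key -- NOT AES and NOT part of the commit.
 */
static void
toy_decrypt(uint8_t out[16], const uint8_t in[16], const uint8_t key[16])
{
	for (int i = 0; i < 16; i++)
		out[i] = in[i] ^ key[i];
}

/*
 * CBC decryption: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV.  The two
 * block decryptions in each iteration are independent, so a 2x-wide
 * primitive like aes_neon_dec2 can push both through the round keys at
 * once.
 */
void
cbc_decrypt_2x(uint8_t *out, const uint8_t *in, size_t nblocks,
    const uint8_t key[16], const uint8_t iv[16])
{
	uint8_t p0[16], p1[16];
	size_t i = 0;

	for (; i + 1 < nblocks; i += 2) {
		const uint8_t *prev = (i == 0) ? iv : in + 16*(i - 1);

		/* Two independent block decryptions, no serial chain.  */
		toy_decrypt(p0, in + 16*i, key);
		toy_decrypt(p1, in + 16*(i + 1), key);

		for (int j = 0; j < 16; j++) {
			out[16*i + j] = p0[j] ^ prev[j];
			out[16*(i + 1) + j] = p1[j] ^ in[16*i + j];
		}
	}
	if (i < nblocks) {		/* odd trailing block */
		const uint8_t *prev = (i == 0) ? iv : in + 16*(i - 1);

		toy_decrypt(p0, in + 16*i, key);
		for (int j = 0; j < 16; j++)
			out[16*i + j] = p0[j] ^ prev[j];
	}
}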
Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.3	Tue Jun 30 20:32:11 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */
+/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
 
 #include <sys/types.h>
 
@@ -589,6 +589,59 @@ aes_neon_enc1(const struct aesenc *enc,
 	return vqtbl1q_u8(x, sr[rmod4]);
 }
 
+uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = enc->aese_aes.aes_rk;
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
+	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
+	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
+	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1;
+	unsigned rmod4 = 0;
+
+	x0 = aes_schedule_transform(x0, ipt);
+	x1 = aes_schedule_transform(x1, ipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	for (;;) {
+		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
+		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
+
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+
+		rk32 += 4;
+		rmod4 = (rmod4 + 1) % 4;
+		if (--nrounds == 0)
+			break;
+
+		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
+		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
+		A_0 ^= loadroundkey(rk32);
+		A_1 ^= loadroundkey(rk32);
+		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
+		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
+		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
+		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
+		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
+		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
+		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
+		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
+	}
+	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
+	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	return (uint8x16x2_t) { .val = {
+		[0] = vqtbl1q_u8(x0, sr[rmod4]),
+		[1] = vqtbl1q_u8(x1, sr[rmod4]),
+	} };
+}
+
 uint8x16_t
 aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
 {
@@ -628,4 +681,60 @@ aes_neon_dec1(const struct aesdec *dec,
 	return vqtbl1q_u8(x, sr[i]);
 }
 
+uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+	unsigned i = 3 & ~(nrounds - 1);
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1, mc;
+
+	x0 = aes_schedule_transform(x0, dipt);
+	x1 = aes_schedule_transform(x1, dipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	rk32 += 4;
+
+	mc = mc_forward[3];
+	for (;;) {
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+		if (--nrounds == 0)
+			break;
+
+		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
+		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
+		x0 ^= loadroundkey(rk32);
+		x1 ^= loadroundkey(rk32);
+		rk32 += 4;	/* next round key */
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
+
+		mc = vextq_u8(mc, mc, 12);
+	}
+	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
+	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	return (uint8x16x2_t) { .val = {
+		[0] = vqtbl1q_u8(x0, sr[i]),
+		[1] = vqtbl1q_u8(x1, sr[i]),
+	} };
+}
+
 #endif

Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3	Sat Jul 25 22:36:06 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */
+/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
 
 #include <sys/endian.h>
 
@@ -111,14 +111,33 @@ aes_neon_cbc_dec(const struct aesdec *de
 
 	cv = loadblock(in + nbytes - 16);
 	storeblock(iv, cv);
-	for (;;) {
+	if (nbytes % 32) {
+		KASSERT(nbytes % 32 == 16);
 		b = aes_neon_dec1(dec, cv, nrounds);
 		if ((nbytes -= 16) == 0)
-			break;
+			goto out;
+		cv = loadblock(in + nbytes - 16);
+		storeblock(out + nbytes, cv ^ b);
+	}
+
+	for (;;) {
+		uint8x16x2_t b2;
+
+		KASSERT(nbytes >= 32);
+
+		b2.val[1] = cv;
+		b2.val[0] = cv = loadblock(in + nbytes - 32);
+		b2 = aes_neon_dec2(dec, b2, nrounds);
+		storeblock(out + nbytes - 16, cv ^ b2.val[1]);
+		if ((nbytes -= 32) == 0) {
+			b = b2.val[0];
+			goto out;
+		}
 		cv = loadblock(in + nbytes - 16);
-		storeblock(out + nbytes, b ^ cv);
+		storeblock(out + nbytes, cv ^ b2.val[0]);
 	}
-	storeblock(out, b ^ iv0);
+
+out:	storeblock(out, b ^ iv0);
 }
 
 static inline uint8x16_t
@@ -186,11 +205,28 @@ aes_neon_xts_enc(const struct aesenc *en
 	KASSERT(nbytes % 16 == 0);
 
 	t = loadblock(tweak);
-	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+	if (nbytes % 32) {
+		KASSERT(nbytes % 32 == 16);
 		b = t ^ loadblock(in);
 		b = aes_neon_enc1(enc, b, nrounds);
 		storeblock(out, t ^ b);
 		t = aes_neon_xts_update(t);
+		nbytes -= 16;
+		in += 16;
+		out += 16;
+	}
+	for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+		uint8x16_t t1;
+		uint8x16x2_t b2;
+
+		t1 = aes_neon_xts_update(t);
+		b2.val[0] = t ^ loadblock(in);
+		b2.val[1] = t1 ^ loadblock(in + 16);
+		b2 = aes_neon_enc2(enc, b2, nrounds);
+		storeblock(out, b2.val[0] ^ t);
+		storeblock(out + 16, b2.val[1] ^ t1);
+
+		t = aes_neon_xts_update(t1);
 	}
 	storeblock(tweak, t);
 }
@@ -206,11 +242,28 @@ aes_neon_xts_dec(const struct aesdec *de
 	KASSERT(nbytes % 16 == 0);
 
 	t = loadblock(tweak);
-	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+	if (nbytes % 32) {
+		KASSERT(nbytes % 32 == 16);
 		b = t ^ loadblock(in);
 		b = aes_neon_dec1(dec, b, nrounds);
 		storeblock(out, t ^ b);
 		t = aes_neon_xts_update(t);
+		nbytes -= 16;
+		in += 16;
+		out += 16;
+	}
+	for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+		uint8x16_t t1;
+		uint8x16x2_t b2;
+
+		t1 = aes_neon_xts_update(t);
+		b2.val[0] = t ^ loadblock(in);
+		b2.val[1] = t1 ^ loadblock(in + 16);
+		b2 = aes_neon_dec2(dec, b2, nrounds);
+		storeblock(out, b2.val[0] ^ t);
+		storeblock(out + 16, b2.val[1] ^ t1);
+
+		t = aes_neon_xts_update(t1);
 	}
 	storeblock(tweak, t);
 }
@@ -262,11 +315,16 @@ aes_neon_ccm_enc1(const struct aesenc *e
 	ctr_be = loadblock(authctr + 16);
 	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
 	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		uint8x16x2_t b2;
 		ptxt = loadblock(in);
-		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
 		ctr = vaddq_u32(ctr, ctr32_inc);
 		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
-		storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds));
+
+		b2.val[0] = auth ^ ptxt;
+		b2.val[1] = ctr_be;
+		b2 = aes_neon_enc2(enc, b2, nrounds);
+		auth = b2.val[0];
+		storeblock(out, ptxt ^ b2.val[1]);
 	}
 	storeblock(authctr, auth);
 	storeblock(authctr + 16, ctr_be);
@@ -278,22 +336,37 @@ aes_neon_ccm_dec1(const struct aesenc *e
     uint32_t nrounds)
 {
 	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
-	uint8x16_t auth, ctr_be, ptxt;
+	uint8x16_t auth, ctr_be, ptxt, pad;
 	uint32x4_t ctr;
 
 	KASSERT(nbytes);
 	KASSERT(nbytes % 16 == 0);
 
-	auth = loadblock(authctr);
 	ctr_be = loadblock(authctr + 16);
 	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
-	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+	ctr = vaddq_u32(ctr, ctr32_inc);
+	ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+	pad = aes_neon_enc1(enc, ctr_be, nrounds);
+	auth = loadblock(authctr);
+	for (;; in += 16, out += 16) {
+		uint8x16x2_t b2;
+
+		ptxt = loadblock(in) ^ pad;
+		auth ^= ptxt;
+		storeblock(out, ptxt);
+
+		if ((nbytes -= 16) == 0)
+			break;
+
 		ctr = vaddq_u32(ctr, ctr32_inc);
 		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
-		ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds);
-		storeblock(out, ptxt);
-		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
+		b2.val[0] = auth;
+		b2.val[1] = ctr_be;
+		b2 = aes_neon_enc2(enc, b2, nrounds);
+		auth = b2.val[0];
+		pad = b2.val[1];
 	}
+	auth = aes_neon_enc1(enc, auth, nrounds);
 	storeblock(authctr, auth);
 	storeblock(authctr + 16, ctr_be);
 }

Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.1 src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.1	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h	Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */
+/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,4 +39,33 @@ uint8x16_t
 uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
 uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
 
+#ifdef __aarch64__
+
+uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
+uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
+
+#else
+
+static inline uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
+{
+
+	return (uint8x16x2_t) { .val = {
+		[0] = aes_neon_enc1(enc, b2.val[0], nrounds),
+		[1] = aes_neon_enc1(enc, b2.val[1], nrounds),
+	} };
+}
+
+static inline uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
+{
+
+	return (uint8x16x2_t) { .val = {
+		[0] = aes_neon_dec1(dec, b2.val[0], nrounds),
+		[1] = aes_neon_dec1(dec, b2.val[1], nrounds),
+	} };
+}
+
+#endif
+
 #endif	/* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.6 src/sys/crypto/aes/arch/arm/arm_neon.h:1.7
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.6	Sat Jul 25 22:43:01 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */
+/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -42,6 +42,7 @@ typedef __Int8x16_t int8x16_t;
 typedef __Uint32x4_t uint32x4_t;
 typedef __Uint64x2_t uint64x2_t;
 typedef __Uint8x16_t uint8x16_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 #else
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
@@ -54,6 +55,7 @@ typedef __simd64_int8_t int8x8_t;
 typedef __simd64_uint8_t uint8x8_t;
 typedef __builtin_neon_udi uint64x1_t;
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 #endif
 
 #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
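[Editor's note] A minimal caller-side sketch of the new 2-block entry point
follows; it is an illustration, not part of the commit.  It assumes the
aes_neon_impl.h declarations (struct aesenc, uint8x16x2_t, aes_neon_enc2)
are in scope.  On aarch64 the call reaches the interleaved vpaes code above;
on 32-bit ARM the static inline fallback in aes_neon_impl.h simply runs
aes_neon_enc1 twice, so callers such as aes_neon_subr.c never need an
#ifdef of their own.

#include "aes_neon_impl.h"	/* aes_neon_enc2, uint8x16x2_t, struct aesenc */

/*
 * Encrypt two independent AES blocks in one pass over the round keys by
 * packing them into a uint8x16x2_t, calling aes_neon_enc2, and unpacking
 * the result.
 */
static void
encrypt_two_blocks(const struct aesenc *enc, uint8x16_t *b0, uint8x16_t *b1,
    unsigned nrounds)
{
	uint8x16x2_t b2;

	b2.val[0] = *b0;
	b2.val[1] = *b1;
	b2 = aes_neon_enc2(enc, b2, nrounds);
	*b0 = b2.val[0];
	*b1 = b2.val[1];
}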