When available, use the AArch64 AES instructions to implement the x86 ones. These do not map 1:1 onto the x86 instructions, but they are considerably more efficient and free of data-dependent timing.
For a typical benchmark (linux tcrypt mode=500), this gives a 2-3x speedup when running on ThunderX2. Signed-off-by: Ard Biesheuvel <a...@kernel.org> --- host/include/aarch64/host/cpuinfo.h | 1 + target/i386/ops_sse.h | 69 ++++++++++++++++++++ util/cpuinfo-aarch64.c | 1 + 3 files changed, 71 insertions(+) diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h index 82227890b4b4db03..05feeb4f4369fc19 100644 --- a/host/include/aarch64/host/cpuinfo.h +++ b/host/include/aarch64/host/cpuinfo.h @@ -9,6 +9,7 @@ #define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */ #define CPUINFO_LSE (1u << 1) #define CPUINFO_LSE2 (1u << 2) +#define CPUINFO_AES (1u << 3) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index fb63af7afa21588d..db79132778efd211 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -20,6 +20,11 @@ #include "crypto/aes.h" +#ifdef __aarch64__ +#include "host/cpuinfo.h" +typedef uint8_t aes_vec_t __attribute__((vector_size(16))); +#endif + #if SHIFT == 0 #define Reg MMXReg #define XMM_ONLY(...) 
@@ -2165,6 +2170,20 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) Reg st = *v; Reg rk = *s; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_AES) { + asm(" .arch_extension aes \n" + " aesd %0.16b, %1.16b \n" + " aesimc %0.16b, %0.16b \n" + " eor %0.16b, %0.16b, %2.16b \n" + : "=w"(*(aes_vec_t *)d) + : "w"((aes_vec_t){}), + "w"(*(aes_vec_t *)s), + "0"(*(aes_vec_t *)v)); + return; + } +#endif + for (i = 0 ; i < 2 << SHIFT ; i++) { int j = i & 3; d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^ @@ -2180,6 +2199,19 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) Reg st = *v; Reg rk = *s; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_AES) { + asm(" .arch_extension aes \n" + " aesd %0.16b, %1.16b \n" + " eor %0.16b, %0.16b, %2.16b \n" + : "=w"(*(aes_vec_t *)d) + : "w"((aes_vec_t){}), + "w"(*(aes_vec_t *)s), + "0"(*(aes_vec_t *)v)); + return; + } +#endif + for (i = 0; i < 8 << SHIFT; i++) { d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]); } @@ -2191,6 +2223,20 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) Reg st = *v; Reg rk = *s; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_AES) { + asm(" .arch_extension aes \n" + " aese %0.16b, %1.16b \n" + " aesmc %0.16b, %0.16b \n" + " eor %0.16b, %0.16b, %2.16b \n" + : "=w"(*(aes_vec_t *)d) + : "w"((aes_vec_t){}), + "w"(*(aes_vec_t *)s), + "0"(*(aes_vec_t *)v)); + return; + } +#endif + for (i = 0 ; i < 2 << SHIFT ; i++) { int j = i & 3; d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^ @@ -2206,6 +2252,19 @@ void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) Reg st = *v; Reg rk = *s; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_AES) { + asm(" .arch_extension aes \n" + " aese %0.16b, %1.16b \n" + " eor %0.16b, %0.16b, %2.16b \n" + : "=w"(*(aes_vec_t *)d) + : "w"((aes_vec_t){}), + "w"(*(aes_vec_t *)s), + "0"(*(aes_vec_t *)v)); + return; + } +#endif + for (i = 0; 
i < 8 << SHIFT; i++) { d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]); } @@ -2217,6 +2276,16 @@ void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) int i; Reg tmp = *s; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_AES) { + asm(" .arch_extension aes \n" + " aesimc %0.16b, %1.16b \n" + : "=w"(*(aes_vec_t *)d) + : "w"(*(aes_vec_t *)s)); + return; + } +#endif + for (i = 0 ; i < 4 ; i++) { d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ AES_imc[tmp.B(4 * i + 1)][1] ^ diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c index f99acb788454e5ab..769cdfeb2fc32d5e 100644 --- a/util/cpuinfo-aarch64.c +++ b/util/cpuinfo-aarch64.c @@ -56,6 +56,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void) unsigned long hwcap = qemu_getauxval(AT_HWCAP); info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0); info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); + info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); #endif #ifdef CONFIG_DARWIN info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE; -- 2.39.2