When available, use the AArch64 AES instructions to implement the x86
ones. They are not a 1:1 fit, but they are considerably more efficient,
and they have no data-dependent timing.

For a typical benchmark (Linux tcrypt mode=500), this gives a 2-3x
speedup when running on ThunderX2.

Signed-off-by: Ard Biesheuvel <a...@kernel.org>
---
 host/include/aarch64/host/cpuinfo.h |  1 +
 target/i386/ops_sse.h               | 69 ++++++++++++++++++++
 util/cpuinfo-aarch64.c              |  1 +
 3 files changed, 71 insertions(+)
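
(Not part of the patch, just a note on the mapping the inline asm relies
on. x86 AESENC computes MixColumns(ShiftRows(SubBytes(st))) ^ rk, while
AArch64 AESE XORs its key operand into the state before
SubBytes/ShiftRows. Passing an all-zero key therefore reduces AESE to
plain SubBytes+ShiftRows, AESMC supplies MixColumns, and the x86 round
key is XORed in afterwards, which is what the asm below does. A rough
standalone sketch of the same thing using the ACLE intrinsics; the
helper names and the -march=armv8-a+crypto build flag are mine, for
illustration only:

    #include <arm_neon.h>

    /* x86 AESENC: full round, MixColumns included. */
    static inline uint8x16_t x86_aesenc(uint8x16_t st, uint8x16_t rk)
    {
        uint8x16_t zero = vdupq_n_u8(0);  /* make AESE's key XOR a no-op */
        return veorq_u8(vaesmcq_u8(vaeseq_u8(st, zero)), rk);
    }

    /* x86 AESENCLAST: final round, no MixColumns on either architecture. */
    static inline uint8x16_t x86_aesenclast(uint8x16_t st, uint8x16_t rk)
    {
        return veorq_u8(vaeseq_u8(st, vdupq_n_u8(0)), rk);
    }

The decryption helpers mirror this with vaesdq_u8/vaesimcq_u8.)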

diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index 82227890b4b4db03..05feeb4f4369fc19 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -9,6 +9,7 @@
 #define CPUINFO_ALWAYS          (1u << 0)  /* so cpuinfo is nonzero */
 #define CPUINFO_LSE             (1u << 1)
 #define CPUINFO_LSE2            (1u << 2)
+#define CPUINFO_AES             (1u << 3)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index fb63af7afa21588d..db79132778efd211 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -20,6 +20,11 @@
 
 #include "crypto/aes.h"
 
+#ifdef __aarch64__
+#include "host/cpuinfo.h"
+typedef uint8_t aes_vec_t __attribute__((vector_size(16)));
+#endif
+
 #if SHIFT == 0
 #define Reg MMXReg
 #define XMM_ONLY(...)
@@ -2165,6 +2170,20 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
     Reg st = *v;
     Reg rk = *s;
 
+#ifdef __aarch64__
+    if (cpuinfo & CPUINFO_AES) {
+        asm("   .arch_extension aes             \n"
+            "   aesd    %0.16b, %1.16b          \n"
+            "   aesimc  %0.16b, %0.16b          \n"
+            "   eor     %0.16b, %0.16b, %2.16b  \n"
+            :   "=w"(*(aes_vec_t *)d)
+            :   "w"((aes_vec_t){}),
+                "w"(*(aes_vec_t *)s),
+                "0"(*(aes_vec_t *)v));
+        return;
+    }
+#endif
+
     for (i = 0 ; i < 2 << SHIFT ; i++) {
         int j = i & 3;
         d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
@@ -2180,6 +2199,19 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
     Reg st = *v;
     Reg rk = *s;
 
+#ifdef __aarch64__
+    if (cpuinfo & CPUINFO_AES) {
+        asm("   .arch_extension aes             \n"
+            "   aesd    %0.16b, %1.16b          \n"
+            "   eor     %0.16b, %0.16b, %2.16b  \n"
+            :   "=w"(*(aes_vec_t *)d)
+            :   "w"((aes_vec_t){}),
+                "w"(*(aes_vec_t *)s),
+                "0"(*(aes_vec_t *)v));
+        return;
+    }
+#endif
+
     for (i = 0; i < 8 << SHIFT; i++) {
         d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
     }
@@ -2191,6 +2223,20 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
     Reg st = *v;
     Reg rk = *s;
 
+#ifdef __aarch64__
+    if (cpuinfo & CPUINFO_AES) {
+        asm("   .arch_extension aes             \n"
+            "   aese    %0.16b, %1.16b          \n"
+            "   aesmc   %0.16b, %0.16b          \n"
+            "   eor     %0.16b, %0.16b, %2.16b  \n"
+            :   "=w"(*(aes_vec_t *)d)
+            :   "w"((aes_vec_t){}),
+                "w"(*(aes_vec_t *)s),
+                "0"(*(aes_vec_t *)v));
+        return;
+    }
+#endif
+
     for (i = 0 ; i < 2 << SHIFT ; i++) {
         int j = i & 3;
         d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
@@ -2206,6 +2252,19 @@ void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
     Reg st = *v;
     Reg rk = *s;
 
+#ifdef __aarch64__
+    if (cpuinfo & CPUINFO_AES) {
+        asm("   .arch_extension aes             \n"
+            "   aese    %0.16b, %1.16b          \n"
+            "   eor     %0.16b, %0.16b, %2.16b  \n"
+            :   "=w"(*(aes_vec_t *)d)
+            :   "w"((aes_vec_t){}),
+                "w"(*(aes_vec_t *)s),
+                "0"(*(aes_vec_t *)v));
+        return;
+    }
+#endif
+
     for (i = 0; i < 8 << SHIFT; i++) {
         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
     }
@@ -2217,6 +2276,16 @@ void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
     int i;
     Reg tmp = *s;
 
+#ifdef __aarch64__
+    if (cpuinfo & CPUINFO_AES) {
+        asm("   .arch_extension aes             \n"
+            "   aesimc  %0.16b, %1.16b          \n"
+            :   "=w"(*(aes_vec_t *)d)
+            :   "w"(*(aes_vec_t *)s));
+        return;
+    }
+#endif
+
     for (i = 0 ; i < 4 ; i++) {
         d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
                           AES_imc[tmp.B(4 * i + 1)][1] ^
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index f99acb788454e5ab..769cdfeb2fc32d5e 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -56,6 +56,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
     unsigned long hwcap = qemu_getauxval(AT_HWCAP);
     info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
     info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
+    info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
 #endif
 #ifdef CONFIG_DARWIN
     info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
-- 
2.39.2
