Use the AArch64 PMULL{2}.P64 instructions to implement PCLMULQDQ, instead of emulating it in C code, if the host supports them. PCLMULQDQ is used in implementations of GCM, which is widely used in IPsec VPNs and HTTPS.
Somewhat surprising results: on my ThunderX2, enabling this on top of the AES acceleration I sent out earlier yields a substantial speedup. (1420 is a typical IPsec block size - in HTTPS, GCM operates on much larger block sizes, but the kernel mode benchmarks are not the best place to measure its performance in that mode.) tcrypt: testing speed of rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption No acceleration tcrypt: test 5 (160 bit key, 1420 byte blocks): 10046 operations in 1 seconds (14265320 bytes) AES acceleration tcrypt: test 5 (160 bit key, 1420 byte blocks): 13970 operations in 1 seconds (19837400 bytes) AES + PMULL acceleration tcrypt: test 5 (160 bit key, 1420 byte blocks): 24372 operations in 1 seconds (34608240 bytes) Signed-off-by: Ard Biesheuvel <a...@kernel.org> --- host/include/aarch64/host/cpuinfo.h | 1 + target/i386/ops_sse.h | 24 ++++++++++++++++++++ util/cpuinfo-aarch64.c | 1 + 3 files changed, 26 insertions(+) diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h index 05feeb4f4369fc19..da268dce1390cac0 100644 --- a/host/include/aarch64/host/cpuinfo.h +++ b/host/include/aarch64/host/cpuinfo.h @@ -10,6 +10,7 @@ #define CPUINFO_LSE (1u << 1) #define CPUINFO_LSE2 (1u << 2) #define CPUINFO_AES (1u << 3) +#define CPUINFO_PMULL (1u << 4) /* Initialized with a constructor. 
*/ extern unsigned cpuinfo; diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index db79132778efd211..d7e7bd8b733122a8 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2157,6 +2157,30 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint64_t a, b; int i; +#ifdef __aarch64__ + if (cpuinfo & CPUINFO_PMULL) { + aes_vec_t vv = *(aes_vec_t *)v, vs = *(aes_vec_t *)s; + aes_vec_t *vd = (aes_vec_t *)d; + + switch (ctrl & 0x11) { + case 0x1: + asm("ext %0.16b, %0.16b, %0.16b, #8":"+w"(vv)); + /* fallthrough */ + case 0x0: + asm(".arch_extension aes\n" + "pmull %0.1q, %1.1d, %2.1d":"=w"(*vd):"w"(vv),"w"(vs)); + break; + case 0x10: + asm("ext %0.16b, %0.16b, %0.16b, #8":"+w"(vv)); + /* fallthrough */ + case 0x11: + asm(".arch_extension aes\n" + "pmull2 %0.1q, %1.2d, %2.2d":"=w"(*vd):"w"(vv),"w"(vs)); + } + return; + } +#endif + for (i = 0; i < 1 << SHIFT; i += 2) { a = v->Q(((ctrl & 1) != 0) + i); b = s->Q(((ctrl & 16) != 0) + i); diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c index 769cdfeb2fc32d5e..95ec1f4adfc829b9 100644 --- a/util/cpuinfo-aarch64.c +++ b/util/cpuinfo-aarch64.c @@ -57,6 +57,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void) info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0); info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); + info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0); #endif #ifdef CONFIG_DARWIN info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE; -- 2.39.2