Module Name:    src
Committed By:   riastradh
Date:           Mon Jun 29 23:31:42 UTC 2020

Modified Files:
        src/sys/arch/aarch64/aarch64: cpu.c
        src/sys/arch/aarch64/conf: files.aarch64
Added Files:
        src/sys/crypto/aes/arch/arm: aes_armv8.c aes_armv8.h aes_armv8_64.S
            files.aesarmv8

Log Message:
Implement AES in kernel using ARMv8.0-AES on aarch64.


To generate a diff of this commit:
cvs rdiff -u -r1.48 -r1.49 src/sys/arch/aarch64/aarch64/cpu.c
cvs rdiff -u -r1.22 -r1.23 src/sys/arch/aarch64/conf/files.aarch64
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_armv8.c \
    src/sys/crypto/aes/arch/arm/aes_armv8.h \
    src/sys/crypto/aes/arch/arm/aes_armv8_64.S \
    src/sys/crypto/aes/arch/arm/files.aesarmv8

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/aarch64/aarch64/cpu.c
diff -u src/sys/arch/aarch64/aarch64/cpu.c:1.48 src/sys/arch/aarch64/aarch64/cpu.c:1.49
--- src/sys/arch/aarch64/aarch64/cpu.c:1.48	Mon Jun 29 23:22:27 2020
+++ src/sys/arch/aarch64/aarch64/cpu.c	Mon Jun 29 23:31:41 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.48 2020/06/29 23:22:27 riastradh Exp $ */
+/* $NetBSD: cpu.c,v 1.49 2020/06/29 23:31:41 riastradh Exp $ */
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.48 2020/06/29 23:22:27 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.49 2020/06/29 23:31:41 riastradh Exp $");
 
 #include "locators.h"
 #include "opt_arm_debug.h"
@@ -44,6 +44,8 @@ __KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.48
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
+#include <crypto/aes/arch/arm/aes_armv8.h>
+
 #include <aarch64/armreg.h>
 #include <aarch64/cpu.h>
 #include <aarch64/cpufunc.h>
@@ -70,6 +72,7 @@ static void cpu_init_counter(struct cpu_
 static void cpu_setup_id(struct cpu_info *);
 static void cpu_setup_sysctl(device_t, struct cpu_info *);
 static void cpu_setup_rng(device_t, struct cpu_info *);
+static void cpu_setup_aes(device_t, struct cpu_info *);
 
 #ifdef MULTIPROCESSOR
 #define NCPUINFO	MAXCPUS
@@ -158,6 +161,7 @@ cpu_attach(device_t dv, cpuid_t id)
 
 	cpu_setup_sysctl(dv, ci);
 	cpu_setup_rng(dv, ci);
+	cpu_setup_aes(dv, ci);
 }
 
 struct cpuidtab {
@@ -589,6 +593,26 @@ cpu_setup_rng(device_t dv, struct cpu_in
 	    RND_FLAG_DEFAULT|RND_FLAG_HASCB);
 }
 
+/*
+ * Set up the AES implementation.
+ */
+static void
+cpu_setup_aes(device_t dv, struct cpu_info *ci)
+{
+	struct aarch64_sysctl_cpu_id *id = &ci->ci_id;
+
+	/* Verify that the CPU supports AES.  */
+	switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) {
+	case ID_AA64ISAR0_EL1_AES_AES:
+	case ID_AA64ISAR0_EL1_AES_PMUL:
+		break;
+	default:
+		return;
+	}
+
+	aes_md_init(&aes_armv8_impl);
+}
+
 #ifdef MULTIPROCESSOR
 void
 cpu_hatch(struct cpu_info *ci)

Index: src/sys/arch/aarch64/conf/files.aarch64
diff -u src/sys/arch/aarch64/conf/files.aarch64:1.22 src/sys/arch/aarch64/conf/files.aarch64:1.23
--- src/sys/arch/aarch64/conf/files.aarch64:1.22	Sat Apr 18 11:00:37 2020
+++ src/sys/arch/aarch64/conf/files.aarch64	Mon Jun 29 23:31:41 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aarch64,v 1.22 2020/04/18 11:00:37 skrll Exp $
+#	$NetBSD: files.aarch64,v 1.23 2020/06/29 23:31:41 riastradh Exp $
 
 defflag opt_cpuoptions.h	AARCH64_ALIGNMENT_CHECK
 defflag opt_cpuoptions.h	AARCH64_EL0_STACK_ALIGNMENT_CHECK
@@ -138,3 +138,6 @@ file	arch/aarch64/aarch64/netbsd32_sysca
 
 # profiling support
 file	dev/tprof/tprof_armv8.c			tprof	needs-flag
+
+# ARMv8.0-AES
+include "crypto/aes/arch/arm/files.aesarmv8"

Added files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8.c
diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_armv8.c:1.1
--- /dev/null	Mon Jun 29 23:31:42 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8.c	Mon Jun 29 23:31:41 2020
@@ -0,0 +1,259 @@
+/*	$NetBSD: aes_armv8.c,v 1.1 2020/06/29 23:31:41 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_armv8.c,v 1.1 2020/06/29 23:31:41 riastradh Exp $");
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+
+#include <crypto/aes/aes.h>
+#include <crypto/aes/arch/arm/aes_armv8.h>
+
+#include <aarch64/armreg.h>
+#include <aarch64/fpu.h>
+
+static void
+aesarmv8_setenckey(struct aesenc *enc, const uint8_t key[static 16],
+    uint32_t nrounds)
+{
+
+	switch (nrounds) {
+	case 10:
+		aesarmv8_setenckey128(enc, key);
+		break;
+	case 12:
+		aesarmv8_setenckey192(enc, key);
+		break;
+	case 14:
+		aesarmv8_setenckey256(enc, key);
+		break;
+	default:
+		panic("invalid AES rounds: %u", nrounds);
+	}
+}
+
+static void
+aesarmv8_setenckey_impl(struct aesenc *enc, const uint8_t key[static 16],
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aesarmv8_setenckey(enc, key, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_setdeckey_impl(struct aesdec *dec, const uint8_t key[static 16],
+    uint32_t nrounds)
+{
+	struct aesenc enc;
+
+	fpu_kern_enter();
+	aesarmv8_setenckey(&enc, key, nrounds);
+	aesarmv8_enctodec(&enc, dec, nrounds);
+	fpu_kern_leave();
+
+	explicit_memset(&enc, 0, sizeof enc);
+}
+
+static void
+aesarmv8_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aesarmv8_enc(enc, in, out, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aesarmv8_dec(dec, in, out, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	KASSERT(nbytes % 16 == 0);
+
+	fpu_kern_enter();
+	aesarmv8_cbc_enc(enc, in, out, nbytes, iv, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+    uint32_t nrounds)
+{
+
+	KASSERT(nbytes % 16 == 0);
+
+	fpu_kern_enter();
+
+	if (nbytes % 128) {
+		aesarmv8_cbc_dec1(dec, in, out, nbytes % 128, iv, nrounds);
+		in += nbytes % 128;
+		out += nbytes % 128;
+		nbytes -= nbytes % 128;
+	}
+
+	KASSERT(nbytes % 128 == 0);
+	if (nbytes)
+		aesarmv8_cbc_dec8(dec, in, out, nbytes, iv, nrounds);
+
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+
+	KASSERT(nbytes % 16 == 0);
+
+	fpu_kern_enter();
+
+	if (nbytes % 128) {
+		aesarmv8_xts_enc1(enc, in, out, nbytes % 128, tweak, nrounds);
+		in += nbytes % 128;
+		out += nbytes % 128;
+		nbytes -= nbytes % 128;
+	}
+
+	KASSERT(nbytes % 128 == 0);
+	if (nbytes)
+		aesarmv8_xts_enc8(enc, in, out, nbytes, tweak, nrounds);
+
+	fpu_kern_leave();
+}
+
+static void
+aesarmv8_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
+
+	KASSERT(nbytes % 16 == 0);
+
+	fpu_kern_enter();
+
+	if (nbytes % 128) {
+		aesarmv8_xts_dec1(dec, in, out, nbytes % 128, tweak, nrounds);
+		in += nbytes % 128;
+		out += nbytes % 128;
+		nbytes -= nbytes % 128;
+	}
+
+	KASSERT(nbytes % 128 == 0);
+	if (nbytes)
+		aesarmv8_xts_dec8(dec, in, out, nbytes, tweak, nrounds);
+
+	fpu_kern_leave();
+}
+
+static int
+aesarmv8_xts_update_selftest(void)
+{
+	static const struct {
+		uint8_t	in[16], out[16];
+	} cases[] = {
+		{{1}, {2}},
+		{{0,0,0,0x80}, {0,0,0,0,1}},
+		{{0,0,0,0,0,0,0,0x80}, {0,0,0,0,0,0,0,0,1}},
+		{{0,0,0,0x80,0,0,0,0x80}, {0,0,0,0,1,0,0,0,1}},
+		{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87}},
+		{{0,0,0,0,0,0,0,0x80,0,0,0,0,0,0,0,0x80},
+		 {0x87,0,0,0,0,0,0,0,1}},
+		{{0,0,0,0x80,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87,0,0,0,1}},
+		{{0,0,0,0x80,0,0,0,0x80,0,0,0,0,0,0,0,0x80},
+		 {0x87,0,0,0,1,0,0,0,1}},
+	};
+	unsigned i;
+	uint8_t tweak[16];
+
+	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
+		aesarmv8_xts_update(cases[i].in, tweak);
+		if (memcmp(tweak, cases[i].out, 16))
+			return -1;
+	}
+
+	/* Success!  */
+	return 0;
+}
+
+static int
+aesarmv8_probe(void)
+{
+	struct aarch64_sysctl_cpu_id *id;
+	int result = 0;
+
+	/* Verify that the CPU supports AES.  */
+	id = &curcpu()->ci_id;
+	switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) {
+	case ID_AA64ISAR0_EL1_AES_AES:
+	case ID_AA64ISAR0_EL1_AES_PMUL:
+		break;
+	default:
+		return -1;
+	}
+
+	fpu_kern_enter();
+
+	/* Verify that our XTS tweak update logic works.  */
+	if (aesarmv8_xts_update_selftest())
+		result = -1;
+
+	fpu_kern_leave();
+
+	return result;
+}
+
+struct aes_impl aes_armv8_impl = {
+	.ai_name = "ARMv8.0-AES",
+	.ai_probe = aesarmv8_probe,
+	.ai_setenckey = aesarmv8_setenckey_impl,
+	.ai_setdeckey = aesarmv8_setdeckey_impl,
+	.ai_enc = aesarmv8_enc_impl,
+	.ai_dec = aesarmv8_dec_impl,
+	.ai_cbc_enc = aesarmv8_cbc_enc_impl,
+	.ai_cbc_dec = aesarmv8_cbc_dec_impl,
+	.ai_xts_enc = aesarmv8_xts_enc_impl,
+	.ai_xts_dec = aesarmv8_xts_dec_impl,
+};
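
For reference, the tweak update that aesarmv8_xts_update performs, and that
the self-test vectors above exercise, is multiplication by x in GF(2^128)
modulo x^128 + x^7 + x^2 + x + 1, acting on a 128-bit value stored
little-endian (byte 0 least significant).  A portable C sketch, for
illustration only -- not part of this commit, and the function name is
hypothetical:

#include <stdint.h>

/*
 * out := in * x mod (x^128 + x^7 + x^2 + x + 1), bytes little-endian.
 */
static void
xts_update_ref(const uint8_t in[16], uint8_t out[16])
{
	unsigned carry = 0, i;

	for (i = 0; i < 16; i++) {
		unsigned b = in[i];

		out[i] = (uint8_t)((b << 1) | carry);	/* shift in carry */
		carry = b >> 7;			/* carry out of this byte */
	}
	if (carry)		/* reduce: x^128 = x^7 + x^2 + x + 1 = 0x87 */
		out[0] ^= 0x87;
}

Feeding it {1} gives {2}, and an input whose only set bit is the top bit of
byte 15 gives 0x87 in byte 0, matching the first and fifth test cases above.
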
Index: src/sys/crypto/aes/arch/arm/aes_armv8.h
diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_armv8.h:1.1
--- /dev/null	Mon Jun 29 23:31:42 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8.h	Mon Jun 29 23:31:41 2020
@@ -0,0 +1,68 @@
+/*	$NetBSD: aes_armv8.h,v 1.1 2020/06/29 23:31:41 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_CRYPTO_AES_AES_ARCH_ARM_AES_ARMV8_H
+#define	_CRYPTO_AES_AES_ARCH_ARM_AES_ARMV8_H
+
+#include <sys/types.h>
+
+#include <crypto/aes/aes.h>
+
+/* Assembly routines */
+
+void	aesarmv8_setenckey128(struct aesenc *, const uint8_t[static 16]);
+void	aesarmv8_setenckey192(struct aesenc *, const uint8_t[static 24]);
+void	aesarmv8_setenckey256(struct aesenc *, const uint8_t[static 32]);
+
+void	aesarmv8_enctodec(const struct aesenc *, struct aesdec *, uint32_t);
+
+void	aesarmv8_enc(const struct aesenc *, const uint8_t[static 16],
+	    uint8_t[static 16], uint32_t);
+void	aesarmv8_dec(const struct aesdec *, const uint8_t[static 16],
+	    uint8_t[static 16], uint32_t);
+
+void	aesarmv8_cbc_enc(const struct aesenc *, const uint8_t[static 16],
+	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_cbc_dec1(const struct aesdec *, const uint8_t[static 16],
+	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
+	    uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
+
+void	aesarmv8_xts_enc1(const struct aesenc *, const uint8_t[static 16],
+	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_xts_enc8(const struct aesenc *, const uint8_t[static 128],
+	    uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_xts_dec1(const struct aesdec *, const uint8_t[static 16],
+	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_xts_dec8(const struct aesdec *, const uint8_t[static 128],
+	    uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
+void	aesarmv8_xts_update(const uint8_t[static 16], uint8_t[static 16]);
+
+extern struct aes_impl aes_armv8_impl;
+
+#endif	/* _CRYPTO_AES_AES_ARCH_ARM_AES_ARMV8_H */
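
For reference, aesarmv8_setenckey128 declared above evaluates the usual
FIPS-197 AES-128 key expansion one round key at a time: the AESE/TBL pair in
the assembly below produces t = Rot(Sub(prk[3])) ^ rcon, and the EXT/EOR
sequence then forms the prefix-XOR cascade over the previous round key.  A C
sketch of one expansion step, with t passed in so the S-box lookup is left
out; illustration only, and the function name is hypothetical:

#include <stdint.h>

/*
 * Given the previous round key prk[0..3] and t = Rot(Sub(prk[3])) ^ rcon,
 * compute the next round key rk[0..3].
 */
static void
aes128_expand_step(const uint32_t prk[4], uint32_t t, uint32_t rk[4])
{

	rk[0] = prk[0] ^ t;
	rk[1] = prk[1] ^ rk[0];		/* = prk[0] ^ prk[1] ^ t */
	rk[2] = prk[2] ^ rk[1];
	rk[3] = prk[3] ^ rk[2];
}

The assembly computes all four words at once by XORing byte-shifted copies of
the previous round key (the EXT instructions) instead of chaining word by
word.
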
Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.1
--- /dev/null	Mon Jun 29 23:31:42 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Mon Jun 29 23:31:41 2020
@@ -0,0 +1,1014 @@
+/*	$NetBSD: aes_armv8_64.S,v 1.1 2020/06/29 23:31:41 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <aarch64/asm.h>
+
+	.arch_extension	crypto
+
+/*
+ * uint32_t rcon[10]
+ *
+ *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
+ *	Such elements of GF(2^8) need only eight bits to be represented,
+ *	but we store them in 4-byte units so we can copy one into all
+ *	four 4-byte lanes of a vector register with a single LD1R.  The
+ *	access pattern is fixed, so indices into this table are never
+ *	secret.
+ */
+	.section .rodata
+	.align	4
+	.type	rcon,@object
+rcon:
+	.long	0x01
+	.long	0x02
+	.long	0x04
+	.long	0x08
+	.long	0x10
+	.long	0x20
+	.long	0x40
+	.long	0x80
+	.long	0x1b
+	.long	0x36
+END(rcon)
+
+/*
+ * uint128_t unshiftrows_rotword_1
+ *
+ *	Table for TBL instruction to undo ShiftRows, and then do
+ *	RotWord on word 1, and then copy it into all the other words.
+ */
+	.section .rodata
+	.align	16
+	.type	unshiftrows_rotword_1,@object
+unshiftrows_rotword_1:
+	.byte	0x01,0x0e,0x0b,0x04
+	.byte	0x01,0x0e,0x0b,0x04
+	.byte	0x01,0x0e,0x0b,0x04
+	.byte	0x01,0x0e,0x0b,0x04
+END(unshiftrows_rotword_1)
+
+/*
+ * uint128_t unshiftrows_3
+ *
+ *	Table for TBL instruction to undo ShiftRows, and then copy word
+ *	3 into all the other words.
+ */
+	.section .rodata
+	.align	16
+	.type	unshiftrows_3,@object
+unshiftrows_3:
+	.byte	0x0c,0x09,0x06,0x03
+	.byte	0x0c,0x09,0x06,0x03
+	.byte	0x0c,0x09,0x06,0x03
+	.byte	0x0c,0x09,0x06,0x03
+END(unshiftrows_3)
+
+/*
+ * uint128_t unshiftrows_rotword_3
+ *
+ *	Table for TBL instruction to undo ShiftRows, and then do
+ *	RotWord on word 3, and then copy it into all the other words.
+ */
+	.section .rodata
+	.align	16
+	.type	unshiftrows_rotword_3,@object
+unshiftrows_rotword_3:
+	.byte	0x09,0x06,0x03,0x0c
+	.byte	0x09,0x06,0x03,0x0c
+	.byte	0x09,0x06,0x03,0x0c
+	.byte	0x09,0x06,0x03,0x0c
+END(unshiftrows_rotword_3)
+
+/*
+ * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
+ *
+ *	Expand a 16-byte AES-128 key into 10 round keys.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_setenckey128)
+	ldr	q1, [x1]	/* q1 := master key */
+
+	adrl	x4, unshiftrows_rotword_3
+	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
+	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */
+
+	str	q1, [x0], #0x10	/* store master key as first round key */
+	mov	x2, #10		/* round count */
+	adrl	x3, rcon	/* round constant */
+
+1:	/*
+	 * q0 = 0
+	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
+	 * x0 = pointer to round key to compute
+	 * x2 = round count
+	 * x3 = rcon pointer
+	 */
+
+	/* q3 := ShiftRows(SubBytes(q1)) */
+	mov	v3.16b, v1.16b
+	aese	v3.16b, v0.16b
+
+	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
+	ld1r	{v4.4s}, [x3], #4
+	tbl	v3.16b, {v3.16b}, v8.16b
+	eor	v3.16b, v3.16b, v4.16b
+
+	/*
+	 * v5.4s := (0,prk[0],prk[1],prk[2])
+	 * v6.4s := (0,0,prk[0],prk[1])
+	 * v7.4s := (0,0,0,prk[0])
+	 */
+	ext	v5.16b, v0.16b, v1.16b, #12
+	ext	v6.16b, v0.16b, v1.16b, #8
+	ext	v7.16b, v0.16b, v1.16b, #4
+
+	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
+	eor	v1.16b, v1.16b, v3.16b
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v6.16b
+	eor	v1.16b, v1.16b, v7.16b
+
+	subs	x2, x2, #1	/* count down rounds */
+	str	q1, [x0], #0x10	/* store round key */
+	b.ne	1b
+
+	ret
+END(aesarmv8_setenckey128)
+
+/*
+ * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
+ *
+ *	Expand a 24-byte AES-192 key into 12 round keys.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_setenckey192)
+	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
+	ldr	d2, [x1]	/* d2 := master key[128:192) */
+
+	adrl	x4, unshiftrows_rotword_1
+	adrl	x5, unshiftrows_rotword_3
+	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
+	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_1 */
+	ldr	q9, [x5]	/* q9 := unshiftrows_rotword_3 */
+
+	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
+	mov	x2, #12		/* round count */
+	adrl	x3, rcon	/* round constant */
+
+1:	/*
+	 * q0 = 0
+	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
+	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
+	 * x0 = pointer to three round keys to compute
+	 * x2 = round count
+	 * x3 = rcon pointer
+	 */
+
+	/* q3 := ShiftRows(SubBytes(q2)) */
+	mov	v3.16b, v2.16b
+	aese	v3.16b, v0.16b
+
+	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
+	ld1r	{v4.4s}, [x3], #4
+	tbl	v3.16b, {v3.16b}, v8.16b
+	eor	v3.16b, v3.16b, v4.16b
+
+	/*
+	 * We need to compute:
+	 *
+	 * rk[0] := rklo[0]
+	 * rk[1] := rklo[1]
+	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
+	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
+	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
+	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
+	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
+	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
+	 *     ^ rklo[1]
+	 */
+
+	/*
+	 * v5.4s := (0,prk[0],prk[1],prk[2])
+	 * v6.4s := (0,0,prk[0],prk[1])
+	 * v7.4s := (0,0,0,prk[0])
+	 */
+	ext	v5.16b, v0.16b, v1.16b, #12
+	ext	v6.16b, v0.16b, v1.16b, #8
+	ext	v7.16b, v0.16b, v1.16b, #4
+
+	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
+	eor	v5.16b, v5.16b, v1.16b
+	eor	v5.16b, v5.16b, v3.16b
+	eor	v5.16b, v5.16b, v6.16b
+	eor	v5.16b, v5.16b, v7.16b
+
+	/*
+	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
+	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
+	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
+	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
+	 * (rklo[0],rklo[1],...).
+	 */
+
+	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
+	dup	v1.4s, v5.4s[3]
+	mov	v1.4s[0], v5.4s[2]
+
+	/*
+	 * v6.4s := (0, 0, rklo[0], rklo[1])
+	 * v7.4s := (0, 0, 0, rklo[0])
+	 */
+	ext	v6.16b, v0.16b, v2.16b, #8
+	ext	v7.16b, v0.16b, v2.16b, #4
+
+	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
+	eor	v3.16b, v1.16b, v6.16b
+	eor	v3.16b, v3.16b, v7.16b
+
+	/*
+	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
+	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
+	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
+	 */
+	mov	v2.2d[1], v5.2d[0]
+
+	/* store two round keys */
+	stp	q2, q3, [x0], #0x20
+
+	/*
+	 * Live vector registers at this point:
+	 *
+	 *	q0 = zero
+	 *	q2 = rk
+	 *	q3 = nrk
+	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
+	 *	q8 = unshiftrows_rotword_1
+	 *	q9 = unshiftrows_rotword_3
+	 *
+	 * We have to compute, in q1:
+	 *
+	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
+	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
+	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
+	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
+	 *     ^ nrk[1]
+	 *
+	 * And, if there's any more afterward, in q2:
+	 *
+	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
+	 *     ^ nrk[1] ^ nrk[2]
+	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
+	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
+	 */
+
+	/* q1 := ShiftRows(SubBytes(q3)) */
+	mov	v1.16b, v3.16b
+	aese	v1.16b, v0.16b
+
+	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
+	ld1r	{v4.4s}, [x3], #4
+	tbl	v1.16b, {v1.16b}, v9.16b
+	eor	v1.16b, v1.16b, v4.16b
+
+	/*
+	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
+	 * v4.4s := (0, rk[2], rk[3], nrk[0])
+	 * v6.4s := (0, 0, rk[2], rk[3])
+	 * v7.4s := (0, 0, 0, rk[2])
+	 */
+	ext	v4.16b, v0.16b, v5.16b, #12
+	ext	v6.16b, v0.16b, v5.16b, #8
+	ext	v7.16b, v0.16b, v5.16b, #4
+
+	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v4.16b
+	eor	v1.16b, v1.16b, v6.16b
+	eor	v1.16b, v1.16b, v7.16b
+
+	subs	x2, x2, #3	/* count down three rounds */
+	str	q1, [x0], #0x10	/* store third round key */
+	b.eq	2f
+
+	/*
+	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
+	 * v5.4s := (0, nrk[2], xxx, xxx)
+	 */
+	ext	v4.16b, v3.16b, v0.16b, #8
+	ext	v5.16b, v0.16b, v4.16b, #12
+
+	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
+	dup	v2.4s, v1.4s[3]
+
+	/*
+	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
+	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
+	 *     xxx, xxx)
+	 */
+	eor	v2.16b, v2.16b, v4.16b
+	eor	v2.16b, v2.16b, v5.16b
+
+	b	1b
+
+2:	ret
+END(aesarmv8_setenckey192)
+
+/*
+ * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
+ *
+ *	Expand a 32-byte AES-256 key into 14 round keys.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_setenckey256)
+	/* q1 := key[0:128), q2 := key[128:256) */
+	ldp	q1, q2, [x1], #0x20
+
+	adrl	x4, unshiftrows_rotword_3
+	adrl	x5, unshiftrows_3
+	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
+	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 */
+	ldr	q9, [x5]	/* q9 := unshiftrows_3 */
+
+	/* store master key as first two round keys */
+	stp	q1, q2, [x0], #0x20
+	mov	x2, #14		/* round count */
+	adrl	x3, rcon	/* round constant */
+
+1:	/*
+	 * q0 = 0
+	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
+	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
+	 * x2 = round count
+	 * x3 = rcon pointer
+	 */
+
+	/* q3 := ShiftRows(SubBytes(q2)) */
+	mov	v3.16b, v2.16b
+	aese	v3.16b, v0.16b
+
+	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
+	ld1r	{v4.4s}, [x3], #4
+	tbl	v3.16b, {v3.16b}, v8.16b
+	eor	v3.16b, v3.16b, v4.16b
+
+	/*
+	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
+	 * v6.4s := (0,0,pprk[0],pprk[1])
+	 * v7.4s := (0,0,0,pprk[0])
+	 */
+	ext	v5.16b, v0.16b, v1.16b, #12
+	ext	v6.16b, v0.16b, v1.16b, #8
+	ext	v7.16b, v0.16b, v1.16b, #4
+
+	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
+	eor	v1.16b, v1.16b, v3.16b
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v6.16b
+	eor	v1.16b, v1.16b, v7.16b
+
+	subs	x2, x2, #2		/* count down two rounds */
+	b.eq	2f			/* stop if this is the last one */
+
+	/* q3 := ShiftRows(SubBytes(q1)) */
+	mov	v3.16b, v1.16b
+	aese	v3.16b, v0.16b
+
+	/* v3.4s[i] := SubBytes(rk[3]) */
+	tbl	v3.16b, {v3.16b}, v9.16b
+
+	/*
+	 * v5.4s := (0,prk[0],prk[1],prk[2])
+	 * v6.4s := (0,0,prk[0],prk[1])
+	 * v7.4s := (0,0,0,prk[0])
+	 */
+	ext	v5.16b, v0.16b, v2.16b, #12
+	ext	v6.16b, v0.16b, v2.16b, #8
+	ext	v7.16b, v0.16b, v2.16b, #4
+
+	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
+	eor	v2.16b, v2.16b, v3.16b
+	eor	v2.16b, v2.16b, v5.16b
+	eor	v2.16b, v2.16b, v6.16b
+	eor	v2.16b, v2.16b, v7.16b
+
+	stp	q1, q2, [x0], #0x20	/* store two round keys */
+	b	1b
+
+2:	str	q1, [x0]		/* store last round key */
+	ret
+END(aesarmv8_setenckey256)
+
+/*
+ * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
+ *     uint32_t nrounds@x2)
+ *
+ *	Convert AES encryption round keys to AES decryption round keys.
+ *	`nrounds' must be between 10 and 14.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_enctodec)
+	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
+1:	str	q0, [x1], #0x10	/* store round key */
+	subs	x2, x2, #1	/* count down round */
+	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
+	b.eq	2f		/* stop if this is the last one */
+	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
+	b	1b
+2:	str	q0, [x1]	/* store first round key verbatim */
+	ret
+END(aesarmv8_enctodec)
+
+/*
+ * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
+ *     uint8_t out[16] @x2, uint32_t nrounds@x3)
+ *
+ *	Encrypt a single block.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_enc)
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	ldr	q0, [x1]	/* q0 := block */
+	bl	aesarmv8_enc1
+	str	q0, [x2]	/* store block */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+	ret
+END(aesarmv8_enc)
+
+/*
+ * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
+ *     uint8_t out[16] @x2, uint32_t nrounds@x3)
+ *
+ *	Decrypt a single block.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_dec)
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	ldr	q0, [x1]	/* q0 := block */
+	bl	aesarmv8_dec1
+	str	q0, [x2]	/* store block */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+	ret
+END(aesarmv8_dec)
+
+/*
+ * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Encrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ *	nbytes must be an integral multiple of 16.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_cbc_enc)
+	cbz	x3, 2f			/* stop if nothing to do */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	mov	x9, x0			/* x9 := enckey */
+	mov	x10, x3			/* x10 := nbytes */
+	ldr	q0, [x4]		/* q0 := chaining value */
+1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
+	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
+	mov	x0, x9			/* x0 := enckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_enc1		/* q0 := ciphertext block */
+	subs	x10, x10, #0x10		/* count down nbytes */
+	str	q0, [x2], #0x10		/* store ciphertext block */
+	b.ne	1b			/* repeat if x10 is nonzero */
+	str	q0, [x4]		/* store chaining value */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+2:	ret
+END(aesarmv8_cbc_enc)
+
+/*
+ * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Decrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ *	nbytes must be a positive integral multiple of 16.  This routine
+ *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_cbc_dec1)
+	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	mov	fp, sp
+	ldr	q8, [x4]		/* q8 := iv */
+	str	q8, [sp, #16]		/* save iv */
+	mov	x9, x0			/* x9 := deckey */
+	mov	x10, x3			/* x10 := nbytes */
+	add	x1, x1, x3		/* x1 := pointer past end of in */
+	add	x2, x2, x3		/* x2 := pointer past end of out */
+	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
+	str	q0, [x4]		/* update iv */
+1:	mov	x0, x9			/* x0 := deckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3 */
+	subs	x10, x10, #0x10		/* count down nbytes */
+	b.eq	2f			/* stop if this is the first block */
+	ldr	q8, [x1, #-0x10]!	/* q8 := chaining value */
+	eor	v0.16b, v0.16b, v8.16b	/* q0 := plaintext block */
+	str	q0, [x2, #-0x10]!	/* store plaintext block */
+	mov	v0.16b, v8.16b		/* move cv = ciphertext block */
+	b	1b
+2:	ldr	q8, [sp, #16]		/* q8 := iv */
+	eor	v0.16b, v0.16b, v8.16b	/* q0 := first plaintext block */
+	str	q0, [x2, #-0x10]!	/* store first plaintext block */
+	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ret
+END(aesarmv8_cbc_dec1)
+
+/*
+ * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
+ *
+ *	nbytes must be a positive integral multiple of 128.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_cbc_dec8)
+	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	mov	fp, sp
+	ldr	q8, [x4]		/* q8 := iv */
+	str	q8, [sp, #16]		/* save iv */
+	mov	x9, x0			/* x9 := deckey */
+	mov	x10, x3			/* x10 := nbytes */
+	add	x1, x1, x3		/* x1 := pointer past end of in */
+	add	x2, x2, x3		/* x2 := pointer past end of out */
+	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
+	str	q7, [x4]		/* update iv */
+1:	ldp	q4, q5, [x1, #-0x20]!
+	ldp	q2, q3, [x1, #-0x20]!
+	ldp	q0, q1, [x1, #-0x20]!
+	mov	v15.16b, v6.16b		/* q[8+i] := cv[i], 0<i<8 */
+	mov	v14.16b, v5.16b
+	mov	v13.16b, v4.16b
+	mov	v12.16b, v3.16b
+	mov	v11.16b, v2.16b
+	mov	v10.16b, v1.16b
+	mov	v9.16b, v0.16b
+	mov	x0, x9			/* x0 := deckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i] */
+	eor	v7.16b, v7.16b, v15.16b	/* q[i] := pt[i] */
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v1.16b, v1.16b, v9.16b
+	subs	x10, x10, #0x80		/* count down nbytes */
+	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
+	stp	q4, q5, [x2, #-0x20]!
+	stp	q2, q3, [x2, #-0x20]!
+	b.eq	2f			/* stop if this is the first block */
+	ldp	q6, q7, [x1, #-0x20]!
+	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
+	stp	q0, q1, [x2, #-0x20]!
+	b	1b
+2:	ldr	q8, [sp, #16]		/* q8 := iv */
+	eor	v0.16b, v0.16b, v8.16b	/* q0 := pt0 */
+	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
+	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ret
+END(aesarmv8_cbc_dec8)
+
+/*
+ * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *	nbytes must be a positive integral multiple of 16.  This routine
+ *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_xts_enc1)
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	mov	x9, x0			/* x9 := enckey */
+	mov	x10, x3			/* x10 := nbytes */
+	ldr	q9, [x4]		/* q9 := tweak */
+1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
+	mov	x0, x9			/* x0 := enckey */
+	mov	x3, x5			/* x3 := nrounds */
+	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
+	bl	aesarmv8_enc1		/* q0 := AES(ptxt ^ tweak) */
+	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
+	str	q0, [x2], #0x10		/* store ciphertext block */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	subs	x10, x10, #0x10		/* count down nbytes */
+	b.ne	1b			/* repeat if more blocks */
+	str	q9, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+	ret
+END(aesarmv8_xts_enc1)
+
+/*
+ * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *	nbytes must be a positive integral multiple of 128.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_xts_enc8)
+	stp	fp, lr, [sp, #-48]!	/* push stack frame with uint128[2] */
+	mov	fp, sp
+	mov	x9, x0			/* x9 := enckey */
+	mov	x10, x3			/* x10 := nbytes */
+	ldr	q9, [x4]		/* q9 := tweak */
+1:	str	q9, [sp, #16]		/* save tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	str	q9, [sp, #32]		/* save tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
+	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
+	ldp	q2, q3, [x1], #0x20
+	ldp	q4, q5, [x1], #0x20
+	ldp	q6, q7, [x1], #0x20
+	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b
+	mov	x0, x9			/* x0 := enckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_enc8		/* encrypt q0,...,q7; trash x0/x3/q8 */
+	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
+	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b
+	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
+	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
+	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
+	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
+	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	subs	x10, x10, #0x80		/* count down nbytes */
+	b.ne	1b			/* repeat if more block groups */
+	str	q9, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	ret
+END(aesarmv8_xts_enc8)
+
+/*
+ * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *	nbytes must be a positive integral multiple of 16.  This routine
+ *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_xts_dec1)
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	mov	x9, x0			/* x9 := deckey */
+	mov	x10, x3			/* x10 := nbytes */
+	ldr	q9, [x4]		/* q9 := tweak */
+1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
+	mov	x0, x9			/* x0 := deckey */
+	mov	x3, x5			/* x3 := nrounds */
+	eor	v0.16b, v0.16b, v9.16b	/* q0 := ctxt ^ tweak */
+	bl	aesarmv8_dec1		/* q0 := AES^-1(ctxt ^ tweak) */
+	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES^-1(ctxt ^ tweak) ^ tweak */
+	str	q0, [x2], #0x10		/* store plaintext block */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	subs	x10, x10, #0x10		/* count down nbytes */
+	b.ne	1b			/* repeat if more blocks */
+	str	q9, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+	ret
+END(aesarmv8_xts_dec1)
+
+/*
+ * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *	Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *	nbytes must be a positive integral multiple of 128.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_xts_dec8)
+	stp	fp, lr, [sp, #-48]!	/* push stack frame with uint128[2] */
+	mov	fp, sp
+	mov	x9, x0			/* x9 := deckey */
+	mov	x10, x3			/* x10 := nbytes */
+	ldr	q9, [x4]		/* q9 := tweak */
+1:	str	q9, [sp, #16]		/* save tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	str	q9, [sp, #32]		/* save tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v12.16b, v9.16b		/* q12 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v13.16b, v9.16b		/* q13 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v14.16b, v9.16b		/* q14 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	mov	v15.16b, v9.16b		/* q15 := tweak[7] */
+	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := ct[i] */
+	ldp	q2, q3, [x1], #0x20
+	ldp	q4, q5, [x1], #0x20
+	ldp	q6, q7, [x1], #0x20
+	eor	v0.16b, v0.16b, v8.16b	/* q[i] := ct[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b
+	mov	x0, x9			/* x0 := deckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_dec8		/* decrypt q0,...,q7; trash x0/x3/q8 */
+	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
+	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES^-1(...) ^ tweak[i] */
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b
+	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
+	stp	q2, q3, [x2], #0x20	/* store plaintext blocks */
+	stp	q4, q5, [x2], #0x20	/* store plaintext blocks */
+	stp	q6, q7, [x2], #0x20	/* store plaintext blocks */
+	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
+	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	subs	x10, x10, #0x80		/* count down nbytes */
+	b.ne	1b			/* repeat if more block groups */
+	str	q9, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	ret
+END(aesarmv8_xts_dec8)
+
+/*
+ * aesarmv8_xts_mulx(tweak@q9)
+ *
+ *	Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
+ *	Uses x0 and q0/q1 as temporaries.
+ */
+	.text
+	_ALIGN_TEXT
+	.type	aesarmv8_xts_mulx,@function
+aesarmv8_xts_mulx:
+	/*
+	 * Simultaneously determine
+	 * (a) whether the high bit of the low half must be
+	 *     shifted into the low bit of the high half, and
+	 * (b) whether the high bit of the high half must be
+	 *     carried into x^128 = x^7 + x^2 + x + 1.
+	 */
+	adrl	x0, xtscarry
+	cmlt	v1.2d, v9.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
+	ldr	q0, [x0]		/* q0 := xtscarry */
+	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
+	shl	v9.2d, v9.2d, #1	/* shift */
+	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
+	eor	v9.16b, v9.16b, v0.16b	/* incorporate (a) and (b) */
+	ret
+END(aesarmv8_xts_mulx)
+
+	.section .rodata
+	.align	16
+	.type	xtscarry,@object
+xtscarry:
+	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
+END(xtscarry)
+
+/*
+ * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
+ *
+ *	Update an AES-XTS tweak.
+ *
+ *	Standard ABI calling convention.
+ */
+ENTRY(aesarmv8_xts_update)
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
+	mov	fp, sp
+	ldr	q9, [x0]		/* load tweak */
+	bl	aesarmv8_xts_mulx	/* q9 *= x */
+	str	q9, [x1]		/* store tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
+	ret
+END(aesarmv8_xts_update)
+
+/*
+ * aesarmv8_enc1(const struct aesenc *enckey@x0,
+ *     uint128_t block@q0, uint32_t nrounds@x3)
+ *
+ *	Encrypt a single AES block in q0.
+ *
+ *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+	.text
+	_ALIGN_TEXT
+	.type	aesarmv8_enc1,@function
+aesarmv8_enc1:
+	ldr	q8, [x0], #0x10		/* load round key */
+1:	subs	x3, x3, #1
+	/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
+	aese	v0.16b, v8.16b
+	ldr	q8, [x0], #0x10		/* load next round key */
+	b.eq	2f
+	/* q0 := MixColumns(q0) */
+	aesmc	v0.16b, v0.16b
+	b	1b
+2:	eor	v0.16b, v0.16b, v8.16b
+	ret
+END(aesarmv8_enc1)
+
+/*
+ * aesarmv8_enc8(const struct aesenc *enckey@x0,
+ *     uint128_t block0@q0, ..., uint128_t block7@q7,
+ *     uint32_t nrounds@x3)
+ *
+ *	Encrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+	.text
+	_ALIGN_TEXT
+	.type	aesarmv8_enc8,@function
+aesarmv8_enc8:
+	ldr	q8, [x0], #0x10		/* load round key */
+1:	subs	x3, x3, #1
+	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
+	aese	v0.16b, v8.16b
+	aese	v1.16b, v8.16b
+	aese	v2.16b, v8.16b
+	aese	v3.16b, v8.16b
+	aese	v4.16b, v8.16b
+	aese	v5.16b, v8.16b
+	aese	v6.16b, v8.16b
+	aese	v7.16b, v8.16b
+	ldr	q8, [x0], #0x10		/* load next round key */
+	b.eq	2f
+	/* q[i] := MixColumns(q[i]) */
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+	aesmc	v2.16b, v2.16b
+	aesmc	v3.16b, v3.16b
+	aesmc	v4.16b, v4.16b
+	aesmc	v5.16b, v5.16b
+	aesmc	v6.16b, v6.16b
+	aesmc	v7.16b, v7.16b
+	b	1b
+2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v8.16b
+	eor	v2.16b, v2.16b, v8.16b
+	eor	v3.16b, v3.16b, v8.16b
+	eor	v4.16b, v4.16b, v8.16b
+	eor	v5.16b, v5.16b, v8.16b
+	eor	v6.16b, v6.16b, v8.16b
+	eor	v7.16b, v7.16b, v8.16b
+	ret
+END(aesarmv8_enc8)
+
+/*
+ * aesarmv8_dec1(const struct aesdec *deckey@x0,
+ *     uint128_t block@q0, uint32_t nrounds@x3)
+ *
+ *	Decrypt a single AES block in q0.
+ *
+ *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+	.text
+	_ALIGN_TEXT
+	.type	aesarmv8_dec1,@function
+aesarmv8_dec1:
+	ldr	q8, [x0], #0x10		/* load round key */
+1:	subs	x3, x3, #1
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */
+	aesd	v0.16b, v8.16b
+	ldr	q8, [x0], #0x10		/* load next round key */
+	b.eq	2f
+	/* q0 := InMixColumns(q0) */
+	aesimc	v0.16b, v0.16b
+	b	1b
+2:	eor	v0.16b, v0.16b, v8.16b
+	ret
+END(aesarmv8_dec1)
+
+/*
+ * aesarmv8_dec8(const struct aesdec *deckey@x0,
+ *     uint128_t block0@q0, ..., uint128_t block7@q7,
+ *     uint32_t nrounds@x3)
+ *
+ *	Decrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+	.text
+	_ALIGN_TEXT
+	.type	aesarmv8_dec8,@function
+aesarmv8_dec8:
+	ldr	q8, [x0], #0x10		/* load round key */
+1:	subs	x3, x3, #1
+	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
+	aesd	v0.16b, v8.16b
+	aesd	v1.16b, v8.16b
+	aesd	v2.16b, v8.16b
+	aesd	v3.16b, v8.16b
+	aesd	v4.16b, v8.16b
+	aesd	v5.16b, v8.16b
+	aesd	v6.16b, v8.16b
+	aesd	v7.16b, v8.16b
+	ldr	q8, [x0], #0x10		/* load next round key */
+	b.eq	2f
+	/* q[i] := InMixColumns(q[i]) */
+	aesimc	v0.16b, v0.16b
+	aesimc	v1.16b, v1.16b
+	aesimc	v2.16b, v2.16b
+	aesimc	v3.16b, v3.16b
+	aesimc	v4.16b, v4.16b
+	aesimc	v5.16b, v5.16b
+	aesimc	v6.16b, v6.16b
+	aesimc	v7.16b, v7.16b
+	b	1b
+2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v8.16b
+	eor	v2.16b, v2.16b, v8.16b
+	eor	v3.16b, v3.16b, v8.16b
+	eor	v4.16b, v4.16b, v8.16b
+	eor	v5.16b, v5.16b, v8.16b
+	eor	v6.16b, v6.16b, v8.16b
+	eor	v7.16b, v7.16b, v8.16b
+	ret
+END(aesarmv8_dec8)
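
For reference, the rcon table at the top of this file is just the successive
powers of x in GF(2^8) under the AES reduction polynomial
x^8 + x^4 + x^3 + x + 1; a throwaway C program to regenerate the ten entries,
for illustration only and not part of the commit:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t rc = 0x01;	/* x^0 */
	int n;

	for (n = 0; n < 10; n++) {
		printf("\t.long\t0x%02x\n", rc);
		/* multiply by x; on carry out, reduce by 0x1b */
		rc = (uint8_t)((rc << 1) ^ ((rc & 0x80) ? 0x1b : 0));
	}
	return 0;
}

Its output matches the .long entries above, 0x01 through 0x36.
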
Index: src/sys/crypto/aes/arch/arm/files.aesarmv8
diff -u /dev/null src/sys/crypto/aes/arch/arm/files.aesarmv8:1.1
--- /dev/null	Mon Jun 29 23:31:42 2020
+++ src/sys/crypto/aes/arch/arm/files.aesarmv8	Mon Jun 29 23:31:41 2020
@@ -0,0 +1,4 @@
+#	$NetBSD: files.aesarmv8,v 1.1 2020/06/29 23:31:41 riastradh Exp $
+
+file	crypto/aes/arch/arm/aes_armv8.c		aes
+file	crypto/aes/arch/arm/aes_armv8_64.S	aes
