Module Name:    src
Committed By:   riastradh
Date:           Mon Jun 29 23:57:56 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_neon.c files.aesneon
Added Files:
        src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
Provide hand-written AES NEON assembly for arm32.

gcc does a lousy job of compiling 128-bit NEON intrinsics on arm32;
hand-writing the encryption and decryption routines made them about
12x faster, by avoiding a zillion loads and stores that spill
everything and the kitchen sink onto the stack.
(But gcc does fine on aarch64, presumably because it has twice as
many vector registers and doesn't have to deal with q registers
aliasing pairs of d registers, e.g. q2 = d4/d5.)
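
To make the register pressure concrete (an illustration, not part of
the commit): arm32 NEON has no 128-bit table-lookup instruction, so a
16-byte shuffle must be synthesized from two 64-bit vtbl lookups over
the d-register halves of each q register, roughly as in the
hypothetical helper sketched below; gcc tends to spill those halves
to the stack rather than keep them in registers.

	#include <arm_neon.h>

	/*
	 * Hypothetical sketch of a 128-bit table lookup on arm32 NEON:
	 * each q register is a pair of d registers, so the lookup is
	 * split into two vtbl2 operations on the 64-bit halves.
	 */
	static inline uint8x16_t
	tbl_16(uint8x16_t tab, uint8x16_t idx)
	{
		uint8x8x2_t t = { { vget_low_u8(tab), vget_high_u8(tab) } };

		return vcombine_u8(vtbl2_u8(t, vget_low_u8(idx)),
		    vtbl2_u8(t, vget_high_u8(idx)));
	}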


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon.c \
    src/sys/crypto/aes/arch/arm/files.aesneon
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.1 src/sys/crypto/aes/arch/arm/aes_neon.c:1.2
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.1	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Mon Jun 29 23:57:56 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $");
 
 #include <sys/types.h>
 
@@ -47,6 +47,12 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v
 
 #include "aes_neon_impl.h"
 
+#ifdef __aarch64__
+#define	__aarch64_used
+#else
+#define	__aarch64_used	__unused
+#endif
+
 static const uint8x16_t
 mc_forward[4] = {
 	{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
@@ -58,7 +64,7 @@ mc_forward[4] = {
 	{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
 	 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
 },
-mc_backward[4] = {
+mc_backward[4] __aarch64_used = {
 	{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
 	 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
 	{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
@@ -68,7 +74,7 @@ mc_backward[4] = {
 	{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
 	 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
 },
-ipt[2] = {
+ipt[2] __aarch64_used = {
 	{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
 	 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
 	{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
@@ -80,55 +86,55 @@ opt[2] = {
 	{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
 	 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
 },
-dipt[2] = {
+dipt[2] __aarch64_used = {
 	{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
 	 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
 	{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
 	 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
 },
-sb1[2] = {
+sb1[2] __aarch64_used = {
 	{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
 	 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
 	{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
 	 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
 },
-sb2[2] = {
+sb2[2] __aarch64_used = {
 	{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
 	 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
 	{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
 	 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
 },
-sbo[2] = {
+sbo[2] __aarch64_used = {
 	{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
 	 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
 	{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
 	 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
 },
-dsb9[2] = {
+dsb9[2] __aarch64_used = {
 	{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
 	 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
 	{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
 	 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
 },
-dsbd[2] = {
+dsbd[2] __aarch64_used = {
 	{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
 	 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
 	{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
 	 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
 },
-dsbb[2] = {
+dsbb[2] __aarch64_used = {
 	{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
 	 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
 	{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
 	 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
 },
-dsbe[2] = {
+dsbe[2] __aarch64_used = {
 	{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
 	 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
 	{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
 	 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
 },
-dsbo[2] = {
+dsbo[2] __aarch64_used = {
 	{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
 	 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
 	{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
@@ -164,7 +170,7 @@ deskew[2] = {
 	{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
 	 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
 },
-sr[4] = {
+sr[4] __aarch64_used = {
 	{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 	 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
 	{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
@@ -533,6 +539,14 @@ aes_neon_setdeckey(struct aesdec *dec, c
 	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
 }
 
+#ifdef __aarch64__
+
+/*
+ * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
+ * do the performance-critical parts -- encryption and decryption -- in
+ * hand-written assembly on arm32.
+ */
+
 uint8x16_t
 aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
 {
@@ -608,3 +622,5 @@ aes_neon_dec1(const struct aesdec *dec, 
 	x ^= loadroundkey(rk32);
 	return vqtbl1q_u8(x, sr[i]);
 }
+
+#endif
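
For context (an illustration, not part of the diff): the C
definitions of aes_neon_enc1 and aes_neon_dec1 above are now compiled
only on aarch64, while arm32 gets the same entry points from the new
aes_neon_32.S below.  A sketch of the shared interface, with the
declarations assumed here to come in via aes_neon_impl.h:

	#include <arm_neon.h>

	struct aesenc;
	struct aesdec;

	/* Defined in aes_neon.c on aarch64, in aes_neon_32.S on arm32. */
	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t,
	    unsigned);
	uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t,
	    unsigned);
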
Index: src/sys/crypto/aes/arch/arm/files.aesneon
diff -u src/sys/crypto/aes/arch/arm/files.aesneon:1.1 src/sys/crypto/aes/arch/arm/files.aesneon:1.2
--- src/sys/crypto/aes/arch/arm/files.aesneon:1.1	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/files.aesneon	Mon Jun 29 23:57:56 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aesneon,v 1.1 2020/06/29 23:56:31 riastradh Exp $
+#	$NetBSD: files.aesneon,v 1.2 2020/06/29 23:57:56 riastradh Exp $
 
 ifdef aarch64
 makeoptions	aes	"COPTS.aes_neon.c"+="-march=armv8-a"
@@ -11,3 +11,7 @@ endif
 file	crypto/aes/arch/arm/aes_neon.c		aes
 file	crypto/aes/arch/arm/aes_neon_impl.c	aes
 file	crypto/aes/arch/arm/aes_neon_subr.c	aes
+
+ifndef aarch64
+file	crypto/aes/arch/arm/aes_neon_32.S	aes
+endif

Added files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u /dev/null src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.1
--- /dev/null	Mon Jun 29 23:57:56 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Mon Jun 29 23:57:56 2020
@@ -0,0 +1,653 @@
+/*	$NetBSD: aes_neon_32.S,v 1.1 2020/06/29 23:57:56 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arm/asm.h>
+
+	.fpu	neon
+
+	.section .rodata
+	.p2align 4
+
+	.type	inv,_ASM_TYPE_OBJECT
+inv:
+	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
+	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
+END(inv)
+
+	.type	inva,_ASM_TYPE_OBJECT
+inva:
+	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
+	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
+END(inva)
+
+	.type	mc_forward,_ASM_TYPE_OBJECT
+mc_forward:
+	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
+	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
+
+	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
+	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
+
+	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
+	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
+
+.Lmc_forward_3:
+	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
+	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
+END(mc_forward)
+
+	.type	mc_backward,_ASM_TYPE_OBJECT
+mc_backward:
+	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
+	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
+
+	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
+	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
+
+	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
+	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
+
+	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
+	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
+END(mc_backward)
+
+	.type	sr,_ASM_TYPE_OBJECT
+sr:
+	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
+	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+
+	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
+	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B
+
+	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
+	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07
+
+	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
+	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
+END(sr)
+
+	.type	iptlo,_ASM_TYPE_OBJECT
+iptlo:
+	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
+END(iptlo)
+
+	.type	ipthi,_ASM_TYPE_OBJECT
+ipthi:
+	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
+END(ipthi)
+
+	.type	sb1_0,_ASM_TYPE_OBJECT
+sb1_0:
+	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
+END(sb1_0)
+
+	.type	sb1_1,_ASM_TYPE_OBJECT
+sb1_1:
+	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
+END(sb1_1)
+
+	.type	sb2_0,_ASM_TYPE_OBJECT
+sb2_0:
+	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
+	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
+END(sb2_0)
+
+	.type	sb2_1,_ASM_TYPE_OBJECT
+sb2_1:
+	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
+END(sb2_1)
+
+	.type	sbo_0,_ASM_TYPE_OBJECT
+sbo_0:
+	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
+	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
+END(sbo_0)
+
+	.type	sbo_1,_ASM_TYPE_OBJECT
+sbo_1:
+	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
+END(sbo_1)
+
+	.type	diptlo,_ASM_TYPE_OBJECT
+diptlo:
+	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
+END(diptlo)
+
+	.type	dipthi,_ASM_TYPE_OBJECT
+dipthi:
+	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
+END(dipthi)
+
+	.type	dsb9_0,_ASM_TYPE_OBJECT
+dsb9_0:
+	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
+END(dsb9_0)
+
+	.type	dsb9_1,_ASM_TYPE_OBJECT
+dsb9_1:
+	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
+END(dsb9_1)
+
+	.type	dsbd_0,_ASM_TYPE_OBJECT
+dsbd_0:
+	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
+	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
+END(dsbd_0)
+
+	.type	dsbd_1,_ASM_TYPE_OBJECT
+dsbd_1:
+	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
+	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
+END(dsbd_1)
+
+	.type	dsbb_0,_ASM_TYPE_OBJECT
+dsbb_0:
+	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
+	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
+END(dsbb_0)
+
+	.type	dsbb_1,_ASM_TYPE_OBJECT
+dsbb_1:
+	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
+	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
+END(dsbb_1)
+
+	.type	dsbe_0,_ASM_TYPE_OBJECT
+dsbe_0:
+	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
+	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
+END(dsbe_0)
+
+	.type	dsbe_1,_ASM_TYPE_OBJECT
+dsbe_1:
+	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
+	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
+END(dsbe_1)
+
+	.type	dsbo_0,_ASM_TYPE_OBJECT
+dsbo_0:
+	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
+	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
+END(dsbo_0)
+
+	.type	dsbo_1,_ASM_TYPE_OBJECT
+dsbo_1:
+	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
+	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
+END(dsbo_1)
+
+/*
+ * aes_neon_enc1(enc, x, nrounds)
+ *
+ *	With -mfloat-abi=hard:
+ *
+ * uint8x16_t@q0
+ * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
+ *     unsigned nrounds@r1)
+ *
+ *	With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'):
+ *
+ * uint8x16_t@(r0,r1,r2,r3)
+ * aes_neon_enc1(const struct aesenc *enc@r0,
+ *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
+ */
+ENTRY(aes_neon_enc1)
+#ifdef _KERNEL
+	vmov	d0, r2, r3		/* d0 := x lo */
+	vldr	d1, [sp]		/* d1 := x hi */
+	ldr	r1, [sp, #8]		/* r1 := nrounds */
+#endif
+	push	{r4, r5, r6, r7, r8, r10, r11, lr}
+	vpush	{d8-d15}
+
+	/*
+	 * r3: rmod4
+	 * r4: mc_forward
+	 * r5: mc_backward
+	 * r6,r7,r8,r10,r11: temporaries
+	 * q0={d0-d1}: x/ak/A
+	 * q1={d2-d3}: 0x0f0f...
+	 * q2={d4-d5}: lo/k/j/io
+	 * q3={d6-d7}: hi/i/jo
+	 * q4={d8-d9}: iptlo
+	 * q5={d10-d11}: ipthi
+	 * q6={d12-d13}: sb1[0]/sbo[0]
+	 * q7={d14-d15}: sb1[1]/sbo[1]
+	 * q8={d16-d17}: sb2[0]
+	 * q9={d18-d19}: sb2[1]
+	 * q10={d20-d21}: inv
+	 * q11={d22-d23}: inva
+	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
+	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
+	 * q14={d28-d29}: rk/A2/A2_B_D
+	 * q15={d30-d31}: A2_B/sr[rmod4]
+	 */
+
+	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	movw	r3, #0
+	vmov.i8	q1, #0x0f
+
+	/* (q4, q5) := (iptlo, ipthi) */
+	ldr	r6, =iptlo
+	ldr	r7, =ipthi
+	vld1.64	{d8-d9}, [r6 :128]
+	vld1.64	{d10-d11}, [r7 :128]
+
+	/* load the rest of the constants */
+	ldr	r4, =sb1_0
+	ldr	r5, =sb1_1
+	ldr	r6, =sb2_0
+	ldr	r7, =sb2_1
+	ldr	r8, =inv
+	ldr	r10, =inva
+	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
+	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
+	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
+	vld1.64	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
+	vld1.64	{d20-d21}, [r8 :128]	/* q10 = inv */
+	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */
+
+	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
+	ldr	r4, =mc_forward
+	ldr	r5, =mc_backward
+
+	/* (q2, q3) := (lo, hi) */
+	vshr.u8	q3, q0, #4
+	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
+	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
+
+	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
+	vtbl.8	d4, {d8-d9}, d4
+	vtbl.8	d5, {d8-d9}, d5
+	vtbl.8	d6, {d10-d11}, d6
+	vtbl.8	d7, {d10-d11}, d7
+
+	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
+	veor	q0, q14, q2
+	veor	q0, q0, q3
+
+	b	2f
+
+1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+
+	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
+	vtbl.8	d24, {d12-d13}, d4
+	vtbl.8	d25, {d12-d13}, d5
+	vtbl.8	d26, {d14-d15}, d6
+	vtbl.8	d27, {d14-d15}, d7
+	veor	q0, q14, q12
+	veor	q0, q0, q13
+
+	/* q14 := A2 = sb2_0[io] + sb2_1[jo] */
+	vtbl.8	d24, {d16-d17}, d4
+	vtbl.8	d25, {d16-d17}, d5
+	vtbl.8	d26, {d18-d19}, d6
+	vtbl.8	d27, {d18-d19}, d7
+	veor	q14, q12, q13
+
+	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
+	add	r6, r4, r3, lsl #4
+	add	r7, r5, r3, lsl #4
+	vld1.64	{d24-d25}, [r6]
+	vld1.64	{d26-d27}, [r7]
+
+	/* q15 := A2_B = A2 + A(mcf) */
+	vtbl.8	d30, {d0-d1}, d24
+	vtbl.8	d31, {d0-d1}, d25
+	veor	q15, q15, q14
+
+	/* q14 := A2_B_D = A2_B + A(mcb) */
+	vtbl.8	d28, {d0-d1}, d26
+	vtbl.8	d29, {d0-d1}, d27
+	veor	q14, q14, q15
+
+	/* q0 := x = A2_B_D + A2_B(mcf) */
+	vtbl.8	d0, {d30-d31}, d24
+	vtbl.8	d1, {d30-d31}, d25
+	veor	q0, q0, q14
+
+2:	/*
+	 * SubBytes
+	 */
+
+	/* (q2, q3) := (k, i) */
+	vshr.u8	q3, q0, #4
+	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
+	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
+
+	/* q0 := a/k */
+	vtbl.8	d0, {d22-d23}, d4
+	vtbl.8	d1, {d22-d23}, d5
+
+	/* q2 := j = i + k */
+	veor	q2, q3, q2
+
+	/* q12 := ir = 1/i */
+	vtbl.8	d24, {d20-d21}, d6
+	vtbl.8	d25, {d20-d21}, d7
+
+	/* q13 := jr = 1/j */
+	vtbl.8	d26, {d20-d21}, d4
+	vtbl.8	d27, {d20-d21}, d5
+
+	/* q12 := iak = 1/i + a/k */
+	veor	q12, q12, q0
+
+	/* q13 := jak = 1/j + a/k */
+	veor	q13, q13, q0
+
+	/* q12 := iakr = 1/(1/i + a/k) */
+	vtbl.8	d24, {d20-d21}, d24
+	vtbl.8	d25, {d20-d21}, d25
+
+	/* q13 := jakr = 1/(1/j + a/k) */
+	vtbl.8	d26, {d20-d21}, d26
+	vtbl.8	d27, {d20-d21}, d27
+
+	/* q2 := io = j + 1/(1/i + a/k) */
+	veor	q2, q2, q12
+
+	/* q3 := jo = i + 1/(1/j + a/k) */
+	veor	q3, q3, q13
+
+	/* advance round */
+	add	r3, r3, #1
+	subs	r1, r1, #1
+	and	r3, r3, #3
+	bne	1b
+
+	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
+	ldr	r8, =sr
+	ldr	r6, =sbo_0
+	ldr	r7, =sbo_1
+	add	r8, r8, r3, lsl #4
+	vld1.64	{d12-d13}, [r6 :128]
+	vld1.64	{d14-d15}, [r7 :128]
+	vld1.64	{d30-d31}, [r8 :128]
+
+	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+
+	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
+	vtbl.8	d4, {d12-d13}, d4
+	vtbl.8	d5, {d12-d13}, d5
+	vtbl.8	d6, {d14-d15}, d6
+	vtbl.8	d7, {d14-d15}, d7
+
+	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
+	veor	q2, q2, q14
+	veor	q2, q2, q3
+
+	/* q0 := x(sr[rmod4]) */
+	vtbl.8	d0, {d4-d5}, d30
+	vtbl.8	d1, {d4-d5}, d31
+
+	vpop	{d8-d15}
+	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
+#ifdef _KERNEL
+	vmov	r0, r1, d0
+	vmov	r2, r3, d1
+#endif
+	bx	lr
+END(aes_neon_enc1)
+
+/*
+ * aes_neon_dec1(dec, x, nrounds)
+ *
+ *	With -mfloat-abi=hard:
+ *
+ * uint8x16_t@q0
+ * aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
+ *     unsigned nrounds@r1)
+ *
+ *	With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'):
+ *
+ * uint8x16_t@(r0,r1,r2,r3)
+ * aes_neon_dec1(const struct aesdec *dec@r0,
+ *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
+ */
+ENTRY(aes_neon_dec1)
+#ifdef _KERNEL
+	vmov	d0, r2, r3		/* d0 := x lo */
+	vldr	d1, [sp]		/* d1 := x hi */
+	ldr	r1, [sp, #8]		/* r1 := nrounds */
+#endif
+	push	{r4, r5, r6, r7, r8, r10, r11, lr}
+	vpush	{d8-d15}
+
+	/*
+	 * r3: 3 & ~(nrounds - 1)
+	 * q0={d0-d1}: x/ak
+	 * q1={d2-d3}: 0x0f0f...
+	 * q2={d4-d5}: lo/k/j/io
+	 * q3={d6-d7}: hi/i/jo
+	 * q4={d8-d9}: diptlo/dsb9[0]
+	 * q5={d10-d11}: dipthi/dsb9[1]
+	 * q6={d12-d13}: dsbb[0]/dsbo[0]
+	 * q7={d14-d15}: dsbb[1]/dsbo[1]
+	 * q8={d16-d17}: dsbd[0]/dsbe[0]
+	 * q9={d18-d19}: dsbd[1]/dsbe[1]
+	 * q10={d20-d21}: inv
+	 * q11={d22-d23}: inva
+	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
+	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
+	 * q14={d28-d29}: rk/xmc
+	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
+	 */
+
+	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
+	vmov.i8	q1, #0x0f
+	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */
+
+	/* (q4, q5) := (diptlo, dipthi) */
+	ldr	r6, =diptlo
+	ldr	r7, =dipthi
+	vld1.64	{d8-d9}, [r6 :128]
+	vld1.64	{d10-d11}, [r7 :128]
+
+	/* load the rest of the constants */
+	ldr	r4, =dsbb_0
+	ldr	r5, =dsbb_1
+	ldr	r6, =inv
+	ldr	r7, =inva
+	ldr	r8, =.Lmc_forward_3
+	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
+	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
+	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
+	vld1.64	{d22-d23}, [r7 :128]	/* q11 := inva */
+	vld1.64	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
+
+	/* (q2, q3) := (lo, hi) */
+	vshr.u8	q3, q0, #4
+	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
+	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
+
+	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
+	vtbl.8	d4, {d8-d9}, d4
+	vtbl.8	d5, {d8-d9}, d5
+	vtbl.8	d6, {d10-d11}, d6
+	vtbl.8	d7, {d10-d11}, d7
+
+	/* load dsb9 */
+	ldr	r4, =dsb9_0
+	ldr	r5, =dsb9_1
+	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
+	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
+
+	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
+	veor	q0, q14, q2
+	veor	q0, q0, q3
+
+	b	2f
+
+1:	/* load dsbd */
+	ldr	r4, =dsbd_0
+	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
+	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
+
+	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+
+	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
+	vtbl.8	d24, {d8-d9}, d4
+	vtbl.8	d25, {d8-d9}, d5
+	vtbl.8	d26, {d10-d11}, d6
+	vtbl.8	d27, {d10-d11}, d7
+	veor	q0, q14, q12
+	veor	q0, q0, q13
+
+	/* q14 := x(mc) */
+	vtbl.8	d28, {d0-d1}, d30
+	vtbl.8	d29, {d0-d1}, d31
+
+	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
+	vtbl.8	d24, {d16-d17}, d4
+	vtbl.8	d25, {d16-d17}, d5
+	vtbl.8	d26, {d18-d19}, d6
+	vtbl.8	d27, {d18-d19}, d7
+	veor	q0, q14, q12
+	veor	q0, q0, q13
+
+	/* load dsbe */
+	ldr	r4, =dsbe_0
+	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
+	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
+
+	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
+	vtbl.8	d28, {d0-d1}, d30
+	vtbl.8	d29, {d0-d1}, d31
+	vtbl.8	d24, {d12-d13}, d4
+	vtbl.8	d25, {d12-d13}, d5
+	vtbl.8	d26, {d14-d15}, d6
+	vtbl.8	d27, {d14-d15}, d7
+	veor	q0, q14, q12
+	veor	q0, q0, q13
+
+	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
+	vtbl.8	d28, {d0-d1}, d30
+	vtbl.8	d29, {d0-d1}, d31
+	vtbl.8	d24, {d16-d17}, d4
+	vtbl.8	d25, {d16-d17}, d5
+	vtbl.8	d26, {d18-d19}, d6
+	vtbl.8	d27, {d18-d19}, d7
+	veor	q0, q14, q12
+	veor	q0, q0, q13
+
+	/* q15 := mc := mc <<< 12*8 */
+	vext.8	q15, q15, q15, #12
+
+2:	/*
+	 * SubBytes
+	 */
+
+	/* (q2, q3) := (k, i) */
+	vshr.u8	q3, q0, #4
+	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
+	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
+
+	/* q0 := a/k */
+	vtbl.8	d0, {d22-d23}, d4
+	vtbl.8	d1, {d22-d23}, d5
+
+	/* q2 := j = i + k */
+	veor	q2, q3, q2
+
+	/* q12 := ir = 1/i */
+	vtbl.8	d24, {d20-d21}, d6
+	vtbl.8	d25, {d20-d21}, d7
+
+	/* q13 := jr = 1/j */
+	vtbl.8	d26, {d20-d21}, d4
+	vtbl.8	d27, {d20-d21}, d5
+
+	/* q12 := iak = 1/i + a/k */
+	veor	q12, q12, q0
+
+	/* q13 := jak = 1/j + a/k */
+	veor	q13, q13, q0
+
+	/* q12 := iakr = 1/(1/i + a/k) */
+	vtbl.8	d24, {d20-d21}, d24
+	vtbl.8	d25, {d20-d21}, d25
+
+	/* q13 := jakr = 1/(1/j + a/k) */
+	vtbl.8	d26, {d20-d21}, d26
+	vtbl.8	d27, {d20-d21}, d27
+
+	/* q2 := io = j + 1/(1/i + a/k) */
+	veor	q2, q2, q12
+
+	/* q3 := jo = i + 1/(1/j + a/k) */
+	veor	q3, q3, q13
+
+	/* advance round */
+	subs	r1, r1, #1
+	bne	1b
+
+	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
+	ldr	r8, =sr
+	ldr	r6, =dsbo_0
+	ldr	r7, =dsbo_1
+	add	r8, r8, r3, lsl #4
+	vld1.64	{d12-d13}, [r6 :128]
+	vld1.64	{d14-d15}, [r7 :128]
+	vld1.64	{d30-d31}, [r8 :128]
+
+	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+
+	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
+	vtbl.8	d4, {d12-d13}, d4
+	vtbl.8	d5, {d12-d13}, d5
+	vtbl.8	d6, {d14-d15}, d6
+	vtbl.8	d7, {d14-d15}, d7
+
+	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
+	veor	q2, q2, q14
+	veor	q2, q2, q3
+
+	/* q0 := x(sr[i]) */
+	vtbl.8	d0, {d4-d5}, d30
+	vtbl.8	d1, {d4-d5}, d31
+
+	vpop	{d8-d15}
+	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
+#ifdef _KERNEL
+	vmov	r0, r1, d0
+	vmov	r2, r3, d1
+#endif
+	bx	lr
+END(aes_neon_dec1)
