Module Name:    src
Committed By:   riastradh
Date:           Thu Sep 10 11:29:02 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.

Not sure why I didn't realize you could do this before!

This frees up some temporary registers, which can now be reallocated to
shave off a few cycles.
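
For reference, the shape of the change is roughly the following sketch
(register numbers and the table label here are illustrative, not the exact
ones used in the file; the real change is in the diff below):

	/* Before: two 128-bit loads from two separately computed addresses. */
	add	r6, r12, #(table_lo - .Lconstants)
	add	r7, r12, #(table_hi - .Lconstants)
	vld1.8	{d8-d9}, [r6 :128]	/* q4 = table_lo */
	vld1.8	{d10-d11}, [r7 :128]	/* q5 = table_hi */

	/* After: one 256-bit load fills both q registers, provided the two
	 * 16-byte tables are laid out consecutively and 32-byte aligned
	 * (.p2align 5); the second address register (r7) is no longer needed.
	 */
	add	r6, r12, #(table_lo - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]	/* q4 = table_lo, q5 = table_hi */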


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6	Sun Aug 16 18:02:03 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:29:02 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <arm/asm.h>
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")
 
 	.fpu	neon
 
@@ -38,9 +38,10 @@ RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020
 	.long	.Lconstants - .
 
 	.section .rodata
-	.p2align 4
+	.p2align 5
 .Lconstants:
 
+.Linv_inva:	/* inv and inva must be consecutive */
 	.type	inv,_ASM_TYPE_OBJECT
 inv:
 	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
@@ -99,125 +100,85 @@ sr:
 	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
 END(sr)
 
-	.type	iptlo,_ASM_TYPE_OBJECT
-iptlo:
-	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+	.type	ipt,_ASM_TYPE_OBJECT
+ipt:
+	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
 	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
-END(iptlo)
-
-	.type	ipthi,_ASM_TYPE_OBJECT
-ipthi:
-	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
 	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
-END(ipthi)
+END(ipt)
 
-	.type	sb1_0,_ASM_TYPE_OBJECT
-sb1_0:
-	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+	.type	sb1,_ASM_TYPE_OBJECT
+sb1:
+	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
 	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
-END(sb1_0)
-
-	.type	sb1_1,_ASM_TYPE_OBJECT
-sb1_1:
-	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
 	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
-END(sb1_1)
+END(sb1)
 
-	.type	sb2_0,_ASM_TYPE_OBJECT
-sb2_0:
-	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
+	.type	sb2,_ASM_TYPE_OBJECT
+sb2:
+	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
 	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
-END(sb2_0)
-
-	.type	sb2_1,_ASM_TYPE_OBJECT
-sb2_1:
-	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
 	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
-END(sb2_1)
+END(sb2)
 
-	.type	sbo_0,_ASM_TYPE_OBJECT
-sbo_0:
-	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
+	.type	sbo,_ASM_TYPE_OBJECT
+sbo:
+	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
 	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
-END(sbo_0)
-
-	.type	sbo_1,_ASM_TYPE_OBJECT
-sbo_1:
-	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
 	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
-END(sbo_1)
+END(sbo)
 
-	.type	diptlo,_ASM_TYPE_OBJECT
-diptlo:
-	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+	.type	dipt,_ASM_TYPE_OBJECT
+dipt:
+	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
 	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
-END(diptlo)
-
-	.type	dipthi,_ASM_TYPE_OBJECT
-dipthi:
-	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
 	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
-END(dipthi)
+END(dipt)
 
-	.type	dsb9_0,_ASM_TYPE_OBJECT
-dsb9_0:
-	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+	.type	dsb9,_ASM_TYPE_OBJECT
+dsb9:
+	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
 	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
-END(dsb9_0)
-
-	.type	dsb9_1,_ASM_TYPE_OBJECT
-dsb9_1:
-	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
 	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
-END(dsb9_1)
+END(dsb9)
 
-	.type	dsbd_0,_ASM_TYPE_OBJECT
-dsbd_0:
-	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
+	.type	dsbd,_ASM_TYPE_OBJECT
+dsbd:
+	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
 	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
-END(dsbd_0)
-
-	.type	dsbd_1,_ASM_TYPE_OBJECT
-dsbd_1:
-	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
+	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
 	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
-END(dsbd_1)
+END(dsbd)
 
-	.type	dsbb_0,_ASM_TYPE_OBJECT
-dsbb_0:
-	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
+	.type	dsbb,_ASM_TYPE_OBJECT
+dsbb:
+	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0	/* 0 */
 	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
-END(dsbb_0)
-
-	.type	dsbb_1,_ASM_TYPE_OBJECT
-dsbb_1:
-	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
+	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1	/* 1 */
 	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
-END(dsbb_1)
+END(dsbb)
 
-	.type	dsbe_0,_ASM_TYPE_OBJECT
-dsbe_0:
-	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
+	.type	dsbe,_ASM_TYPE_OBJECT
+dsbe:
+	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46	/* 0 */
 	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
-END(dsbe_0)
-
-	.type	dsbe_1,_ASM_TYPE_OBJECT
-dsbe_1:
-	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
+	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C	/* 1 */
 	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
-END(dsbe_1)
+END(dsbe)
 
-	.type	dsbo_0,_ASM_TYPE_OBJECT
-dsbo_0:
-	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
+	.type	dsbo,_ASM_TYPE_OBJECT
+dsbo:
+	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13	/* 0 */
 	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
-END(dsbo_0)
-
-	.type	dsbo_1,_ASM_TYPE_OBJECT
-dsbo_1:
-	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
+	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12	/* 1 */
 	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
-END(dsbo_1)
+END(dsbo)
 
 /*
  * aes_neon_enc1(enc, x, nrounds)
@@ -274,7 +235,7 @@ ENTRY(aes_neon_enc1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr
 
-	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f
 
@@ -282,24 +243,16 @@ ENTRY(aes_neon_enc1)
 	add	r12, r12, r11
 
 	/* (q4, q5) := (iptlo, ipthi) */
-	add	r6, r12, #(iptlo - .Lconstants)
-	add	r7, r12, #(ipthi - .Lconstants)
-	vld1.8	{d8-d9}, [r6 :128]
-	vld1.8	{d10-d11}, [r7 :128]
+	add	r6, r12, #(ipt - .Lconstants)
+	vld1.8	{q4-q5}, [r6 :256]
 
 	/* load the rest of the constants */
-	add	r4, r12, #(sb1_0 - .Lconstants)
-	add	r5, r12, #(sb1_1 - .Lconstants)
-	add	r6, r12, #(sb2_0 - .Lconstants)
-	add	r7, r12, #(sb2_1 - .Lconstants)
-	add	r8, r12, #(inv - .Lconstants)
-	add	r10, r12, #(inva - .Lconstants)
-	vld1.8	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
-	vld1.8	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
-	vld1.8	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
-	vld1.8	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
-	vld1.8	{d20-d21}, [r8 :128]	/* q10 = inv */
-	vld1.8	{d22-d23}, [r10 :128]	/* q11 = inva */
+	add	r4, r12, #(sb1 - .Lconstants)
+	add	r6, r12, #(sb2 - .Lconstants)
+	add	r8, r12, #(.Linv_inva - .Lconstants)
+	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
+	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
+	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */
 
 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
 	add	r4, r12, #(mc_forward - .Lconstants)
@@ -323,7 +276,7 @@ ENTRY(aes_neon_enc1)
 	b	2f
 
 	_ALIGN_TEXT
-1:	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
 	vtbl.8	d24, {d12-d13}, d4
@@ -343,8 +296,8 @@ ENTRY(aes_neon_enc1)
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
 	add	r6, r4, r3, lsl #4
 	add	r7, r5, r3, lsl #4
-	vld1.8	{d24-d25}, [r6]
-	vld1.8	{d26-d27}, [r7]
+	vld1.8	{q12}, [r6 :128]
+	vld1.8	{q13}, [r7 :128]
 
 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {d0-d1}, d24
@@ -413,14 +366,12 @@ ENTRY(aes_neon_enc1)
 
 	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
 	add	r8, r12, #(sr - .Lconstants)
-	add	r6, r12, #(sbo_0 - .Lconstants)
-	add	r7, r12, #(sbo_1 - .Lconstants)
+	add	r6, r12, #(sbo - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.8	{d12-d13}, [r6 :128]
-	vld1.8	{d14-d15}, [r7 :128]
-	vld1.8	{d30-d31}, [r8 :128]
+	vld1.8	{q6-q7}, [r6 :256]
+	vld1.8	{q15}, [r8 :128]
 
-	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
@@ -502,7 +453,7 @@ ENTRY(aes_neon_dec1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr
 
-	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */
@@ -511,22 +462,16 @@ ENTRY(aes_neon_dec1)
 	add	r12, r12, r11
 
 	/* (q4, q5) := (diptlo, dipthi) */
-	add	r6, r12, #(diptlo - .Lconstants)
-	add	r7, r12, #(dipthi - .Lconstants)
-	vld1.8	{d8-d9}, [r6 :128]
-	vld1.8	{d10-d11}, [r7 :128]
+	add	r6, r12, #(dipt - .Lconstants)
+	vld1.8	{q4-q5}, [r6 :256]
 
 	/* load the rest of the constants */
-	add	r4, r12, #(dsbb_0 - .Lconstants)
-	add	r5, r12, #(dsbb_1 - .Lconstants)
-	add	r6, r12, #(inv - .Lconstants)
-	add	r7, r12, #(inva - .Lconstants)
+	add	r4, r12, #(dsbb - .Lconstants)
+	add	r6, r12, #(.Linv_inva - .Lconstants)
 	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
-	vld1.8	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
-	vld1.8	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
-	vld1.8	{d20-d21}, [r6 :128]	/* q10 := inv */
-	vld1.8	{d22-d23}, [r7 :128]	/* q11 := inva */
-	vld1.8	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
+	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
+	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
+	vld1.8	{q15}, [r8 :128]	/* q15 := mc_forward[3] */
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -540,10 +485,8 @@ ENTRY(aes_neon_dec1)
 	vtbl.8	d7, {d10-d11}, d7
 
 	/* load dsb9 */
-	add	r4, r12, #(dsb9_0 - .Lconstants)
-	add	r5, r12, #(dsb9_1 - .Lconstants)
-	vld1.8	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
-	vld1.8	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
+	add	r4, r12, #(dsb9 - .Lconstants)
+	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */
 
 	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
 	veor	q0, q14, q2
@@ -553,11 +496,10 @@ ENTRY(aes_neon_dec1)
 
 	_ALIGN_TEXT
 1:	/* load dsbd */
-	add	r4, r12, #(dsbd_0 - .Lconstants)
-	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
-	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
+	add	r4, r12, #(dsbd - .Lconstants)
+	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */
 
-	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
 	vtbl.8	d24, {d8-d9}, d4
@@ -580,9 +522,8 @@ ENTRY(aes_neon_dec1)
 	veor	q0, q0, q13
 
 	/* load dsbe */
-	add	r4, r12, #(dsbe_0 - .Lconstants)
-	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
-	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
+	add	r4, r12, #(dsbe - .Lconstants)
+	vld1.8	{q8-q9}, [r4 :256]!	/* q8 := dsbe[0], q9 := dsbe[1] */
 
 	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
 	vtbl.8	d28, {d0-d1}, d30
@@ -657,14 +598,12 @@ ENTRY(aes_neon_dec1)
 
 	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
 	add	r8, r12, #(sr - .Lconstants)
-	add	r6, r12, #(dsbo_0 - .Lconstants)
-	add	r7, r12, #(dsbo_1 - .Lconstants)
+	add	r6, r12, #(dsbo - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.8	{d12-d13}, [r6 :128]
-	vld1.8	{d14-d15}, [r7 :128]
-	vld1.8	{d30-d31}, [r8 :128]
+	vld1.8	{q6-q7}, [r6 :256]
+	vld1.8	{q15}, [r8 :128]
 
-	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
