Module Name:	src
Committed By:	riastradh
Date:		Thu Sep 10 11:29:02 UTC 2020

Modified Files:
	src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.

Not sure why I didn't realize you could do this before!

Saves some temporary registers that can now be allocated to shave off
a few cycles.

To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 Sun Aug 16 18:02:03 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:29:02 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include <arm/asm.h> -RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $") .fpu neon @@ -38,9 +38,10 @@ RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020 .long .Lconstants - . .section .rodata - .p2align 4 + .p2align 5 .Lconstants: +.Linv_inva: /* inv and inva must be consecutive */ .type inv,_ASM_TYPE_OBJECT inv: .byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E @@ -99,125 +100,85 @@ sr: .byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03 END(sr) - .type iptlo,_ASM_TYPE_OBJECT -iptlo: - .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 + .type ipt,_ASM_TYPE_OBJECT +ipt: + .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 /* lo */ .byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA -END(iptlo) - - .type ipthi,_ASM_TYPE_OBJECT -ipthi: - .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C + .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */ .byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD -END(ipthi) +END(ipt) - .type sb1_0,_ASM_TYPE_OBJECT -sb1_0: - .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 + .type sb1,_ASM_TYPE_OBJECT +sb1: + .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */ .byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5 -END(sb1_0) - - .type sb1_1,_ASM_TYPE_OBJECT -sb1_1: - .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 + .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */ .byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B -END(sb1_1) +END(sb1) - .type 
sb2_0,_ASM_TYPE_OBJECT -sb2_0: - .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 + .type sb2,_ASM_TYPE_OBJECT +sb2: + .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */ .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E -END(sb2_0) - - .type sb2_1,_ASM_TYPE_OBJECT -sb2_1: - .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 + .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */ .byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2 -END(sb2_1) +END(sb2) - .type sbo_0,_ASM_TYPE_OBJECT -sbo_0: - .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 + .type sbo,_ASM_TYPE_OBJECT +sbo: + .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */ .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15 -END(sbo_0) - - .type sbo_1,_ASM_TYPE_OBJECT -sbo_1: - .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF + .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */ .byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E -END(sbo_1) +END(sbo) - .type diptlo,_ASM_TYPE_OBJECT -diptlo: - .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F + .type dipt,_ASM_TYPE_OBJECT +dipt: + .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F /* lo */ .byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15 -END(diptlo) - - .type dipthi,_ASM_TYPE_OBJECT -dipthi: - .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 + .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 /* hi */ .byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12 -END(dipthi) +END(dipt) - .type dsb9_0,_ASM_TYPE_OBJECT -dsb9_0: - .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 + .type dsb9,_ASM_TYPE_OBJECT +dsb9: + .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 /* 0 */ .byte 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA -END(dsb9_0) - - .type dsb9_1,_ASM_TYPE_OBJECT -dsb9_1: - .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 + .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 /* 1 */ .byte 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72 -END(dsb9_1) +END(dsb9) - .type dsbd_0,_ASM_TYPE_OBJECT -dsbd_0: - .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D + .type dsbd,_ASM_TYPE_OBJECT +dsbd: + .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D /* 0 */ .byte 
0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5 -END(dsbd_0) - - .type dsbd_1,_ASM_TYPE_OBJECT -dsbd_1: - .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C + .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C /* 1 */ .byte 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29 -END(dsbd_1) +END(dsbd) - .type dsbb_0,_ASM_TYPE_OBJECT -dsbb_0: - .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0 + .type dsbb,_ASM_TYPE_OBJECT +dsbb: + .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0 /* 0 */ .byte 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60 -END(dsbb_0) - - .type dsbb_1,_ASM_TYPE_OBJECT -dsbb_1: - .byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1 + .byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1 /* 1 */ .byte 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3 -END(dsbb_1) +END(dsbb) - .type dsbe_0,_ASM_TYPE_OBJECT -dsbe_0: - .byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46 + .type dsbe,_ASM_TYPE_OBJECT +dsbe: + .byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46 /* 0 */ .byte 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22 -END(dsbe_0) - - .type dsbe_1,_ASM_TYPE_OBJECT -dsbe_1: - .byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C + .byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C /* 1 */ .byte 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94 -END(dsbe_1) +END(dsbe) - .type dsbo_0,_ASM_TYPE_OBJECT -dsbo_0: - .byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13 + .type dsbo,_ASM_TYPE_OBJECT +dsbo: + .byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13 /* 0 */ .byte 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7 -END(dsbo_0) - - .type dsbo_1,_ASM_TYPE_OBJECT -dsbo_1: - .byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12 + .byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12 /* 1 */ .byte 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA -END(dsbo_1) +END(dsbo) /* * aes_neon_enc1(enc, x, nrounds) @@ -274,7 +235,7 @@ ENTRY(aes_neon_enc1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {q14}, [r0 :128]! 
/* q14 = *rk++ */ movw r3, #0 vmov.i8 q1, #0x0f @@ -282,24 +243,16 @@ ENTRY(aes_neon_enc1) add r12, r12, r11 /* (q4, q5) := (iptlo, ipthi) */ - add r6, r12, #(iptlo - .Lconstants) - add r7, r12, #(ipthi - .Lconstants) - vld1.8 {d8-d9}, [r6 :128] - vld1.8 {d10-d11}, [r7 :128] + add r6, r12, #(ipt - .Lconstants) + vld1.8 {q4-q5}, [r6 :256] /* load the rest of the constants */ - add r4, r12, #(sb1_0 - .Lconstants) - add r5, r12, #(sb1_1 - .Lconstants) - add r6, r12, #(sb2_0 - .Lconstants) - add r7, r12, #(sb2_1 - .Lconstants) - add r8, r12, #(inv - .Lconstants) - add r10, r12, #(inva - .Lconstants) - vld1.8 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ - vld1.8 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ - vld1.8 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ - vld1.8 {d18-d19}, [r7 :128] /* q9 = sb2[1] */ - vld1.8 {d20-d21}, [r8 :128] /* q10 = inv */ - vld1.8 {d22-d23}, [r10 :128] /* q11 = inva */ + add r4, r12, #(sb1 - .Lconstants) + add r6, r12, #(sb2 - .Lconstants) + add r8, r12, #(.Linv_inva - .Lconstants) + vld1.8 {q6-q7}, [r4 :256] /* q6 = sb1[0], q7 = sb1[1] */ + vld1.8 {q8-q9}, [r6 :256] /* q8 = sb2[0], q9 = sb2[1] */ + vld1.8 {q10-q11}, [r8 :256] /* q10 = inv, q11 = inva */ /* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */ add r4, r12, #(mc_forward - .Lconstants) @@ -323,7 +276,7 @@ ENTRY(aes_neon_enc1) b 2f _ALIGN_TEXT -1: vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ +1: vld1.8 {q14}, [r0 :128]! 
/* q14 = *rk++ */ /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */ vtbl.8 d24, {d12-d13}, d4 @@ -343,8 +296,8 @@ ENTRY(aes_neon_enc1) /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ add r6, r4, r3, lsl #4 add r7, r5, r3, lsl #4 - vld1.8 {d24-d25}, [r6] - vld1.8 {d26-d27}, [r7] + vld1.8 {q12}, [r6 :128] + vld1.8 {q13}, [r7 :128] /* q15 := A2_B = A2 + A(mcf) */ vtbl.8 d30, {d0-d1}, d24 @@ -413,14 +366,12 @@ ENTRY(aes_neon_enc1) /* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */ add r8, r12, #(sr - .Lconstants) - add r6, r12, #(sbo_0 - .Lconstants) - add r7, r12, #(sbo_1 - .Lconstants) + add r6, r12, #(sbo - .Lconstants) add r8, r8, r3, lsl #4 - vld1.8 {d12-d13}, [r6 :128] - vld1.8 {d14-d15}, [r7 :128] - vld1.8 {d30-d31}, [r8 :128] + vld1.8 {q6-q7}, [r6 :256] + vld1.8 {q15}, [r8 :128] - vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4 @@ -502,7 +453,7 @@ ENTRY(aes_neon_dec1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {q14}, [r0 :128]! 
/* q14 = *rk++ */ rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */ vmov.i8 q1, #0x0f and r3, r3, #3 /* r3 := 3 & ~(x - 1) */ @@ -511,22 +462,16 @@ ENTRY(aes_neon_dec1) add r12, r12, r11 /* (q4, q5) := (diptlo, dipthi) */ - add r6, r12, #(diptlo - .Lconstants) - add r7, r12, #(dipthi - .Lconstants) - vld1.8 {d8-d9}, [r6 :128] - vld1.8 {d10-d11}, [r7 :128] + add r6, r12, #(dipt - .Lconstants) + vld1.8 {q4-q5}, [r6 :256] /* load the rest of the constants */ - add r4, r12, #(dsbb_0 - .Lconstants) - add r5, r12, #(dsbb_1 - .Lconstants) - add r6, r12, #(inv - .Lconstants) - add r7, r12, #(inva - .Lconstants) + add r4, r12, #(dsbb - .Lconstants) + add r6, r12, #(.Linv_inva - .Lconstants) add r8, r12, #(.Lmc_forward_3 - .Lconstants) - vld1.8 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ - vld1.8 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ - vld1.8 {d20-d21}, [r6 :128] /* q10 := inv */ - vld1.8 {d22-d23}, [r7 :128] /* q11 := inva */ - vld1.8 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */ + vld1.8 {q6-q7}, [r4 :256] /* q6 := dsbb[0], q7 := dsbb[1] */ + vld1.8 {q10-q11}, [r6 :256] /* q10 := inv, q11 := inva */ + vld1.8 {q15}, [r8 :128] /* q15 := mc_forward[3] */ /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -540,10 +485,8 @@ ENTRY(aes_neon_dec1) vtbl.8 d7, {d10-d11}, d7 /* load dsb9 */ - add r4, r12, #(dsb9_0 - .Lconstants) - add r5, r12, #(dsb9_1 - .Lconstants) - vld1.8 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ - vld1.8 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ + add r4, r12, #(dsb9 - .Lconstants) + vld1.8 {q4-q5}, [r4 :256] /* q4 := dsb9[0], q5 := dsb9[1] */ /* q0 := rk[0] + diptlo(lo) + dipthi(hi) */ veor q0, q14, q2 @@ -553,11 +496,10 @@ ENTRY(aes_neon_dec1) _ALIGN_TEXT 1: /* load dsbd */ - add r4, r12, #(dsbd_0 - .Lconstants) - vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ - vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ + add r4, r12, #(dsbd - .Lconstants) + vld1.8 {q8-q9}, [r4 :256] /* q8 := dsbd[0], q9 := dsbd[1] */ - vld1.8 {d28-d29}, [r0 :128]! 
/* q14 = *rk++ */ + vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ /* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */ vtbl.8 d24, {d8-d9}, d4 @@ -580,9 +522,8 @@ ENTRY(aes_neon_dec1) veor q0, q0, q13 /* load dsbe */ - add r4, r12, #(dsbe_0 - .Lconstants) - vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */ - vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */ + add r4, r12, #(dsbe - .Lconstants) + vld1.8 {q8-q9}, [r4 :256]! /* q8 := dsbe[0], q9 := dsbe[1] */ /* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */ vtbl.8 d28, {d0-d1}, d30 @@ -657,14 +598,12 @@ ENTRY(aes_neon_dec1) /* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */ add r8, r12, #(sr - .Lconstants) - add r6, r12, #(dsbo_0 - .Lconstants) - add r7, r12, #(dsbo_1 - .Lconstants) + add r6, r12, #(dsbo - .Lconstants) add r8, r8, r3, lsl #4 - vld1.8 {d12-d13}, [r6 :128] - vld1.8 {d14-d15}, [r7 :128] - vld1.8 {d30-d31}, [r8 :128] + vld1.8 {q6-q7}, [r6 :256] + vld1.8 {q15}, [r8 :128] - vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4