Module Name:	src
Committed By:	riastradh
Date:		Sat Aug 8 14:47:01 UTC 2020

Modified Files:
	src/sys/crypto/aes/arch/arm: aes_armv8_64.S aes_neon.c aes_neon_32.S
	    aes_neon_impl.h aes_neon_subr.c arm_neon.h
	src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c
	    chacha_neon_32.S chacha_neon_64.S
Added Files:
	src/sys/crypto/aes/arch/arm: arm_neon_imm.h
	src/sys/crypto/chacha/arch/arm: arm_neon_imm.h

Log Message:
Fix ARM NEON implementations of AES and ChaCha on big-endian ARM.

New macros such as VQ_N_U32(a,b,c,d) for NEON vector initializers.
Needed because GCC and Clang disagree on the ordering of lanes,
depending on whether it's 64-bit big-endian, 32-bit big-endian, or
little-endian -- and, bizarrely, both of them disagree with the
architectural numbering of lanes.

Experimented with using

	static const uint8_t x8[16] = {...};

	uint8x16_t x = vld1q_u8(x8);

which doesn't require knowing anything about the ordering of lanes,
but this generates considerably worse code and apparently confuses
GCC into not recognizing the constant value of x8.

Fix some clang mistakes while here too.

To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/sys/crypto/aes/arch/arm/aes_armv8_64.S
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_neon.c \
    src/sys/crypto/aes/arch/arm/aes_neon_32.S \
    src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/arm_neon.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/arm_neon.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
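For context on the log message: the added arm_neon_imm.h is not reproduced
in this mail, so the following is only a minimal illustrative sketch of what
VQ_N_U8/VQ_N_U32-style initializer macros can look like.  The big-endian
permutations shown are assumptions, mirroring the __neon_laneq_index()
definitions visible in the arm_neon.h hunks below; the real header also has
to distinguish GCC from Clang, which disagree on this ordering.

	/*
	 * Sketch only -- NOT the committed arm_neon_imm.h.  Idea: write the
	 * lanes once, in architectural order, and let the macro permute the
	 * initializer into whatever order the compiler's vector literal
	 * expects for the target endianness.
	 */
	#include <arm_neon.h>	/* standard intrinsics header, for the sketch */

	#if defined(__AARCH64EB__)
	/* 64-bit big-endian: assume the literal takes lanes fully reversed. */
	#define	VQ_N_U8(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)		      \
		{ p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a }
	#define	VQ_N_U32(a,b,c,d)	{ d,c,b,a }
	#elif defined(__ARM_BIG_ENDIAN)
	/* 32-bit big-endian: assume lanes reversed within each 64-bit half. */
	#define	VQ_N_U8(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)		      \
		{ h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i }
	#define	VQ_N_U32(a,b,c,d)	{ b,a, d,c }
	#else
	/* Little-endian: lane 0 comes first, no permutation needed. */
	#define	VQ_N_U8(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)		      \
		{ a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p }
	#define	VQ_N_U32(a,b,c,d)	{ a,b,c,d }
	#endif

	/* Usage, as in the aes_neon.c and chacha_neon.c hunks below: */
	static const uint8x16_t sr0 =
	    VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
		0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F);
	static const uint32x4_t ctr_inc = VQ_N_U32(0, 0, 0, 1);

The vld1q_u8-of-a-static-array alternative mentioned in the log message is
endian-agnostic and needs no such macro, but per the log it produced
considerably worse code, which is why the permuting initializer macros were
chosen instead.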
Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.11 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.11 Mon Jul 27 20:57:23 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -26,11 +26,9 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/endian.h> - #include <aarch64/asm.h> -RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $") +RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $") .arch_extension aes @@ -921,19 +919,13 @@ ENTRY(aesarmv8_ccm_enc1) ld1 {v5.4s}, [x11] /* q5 := (0,0,0,1) (host-endian) */ mov x9, x0 /* x9 := enckey */ mov x10, x3 /* x10 := nbytes */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */ -#endif _ALIGN_TEXT 1: ldr q3, [x1], #0x10 /* q3 := plaintext block */ add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */ mov x0, x9 /* x0 := enckey */ mov x3, x5 /* x3 := nrounds */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v1.16b, v2.16b /* q1 := ctr (big-endian) */ -#else - mov v1.16b, v2.16b /* q1 := ctr (big-endian) */ -#endif eor v0.16b, v0.16b, v3.16b /* q0 := auth ^ ptxt */ bl aesarmv8_enc2 /* q0 := auth', q1 := pad; * trash x0/x3/q16 */ @@ -941,9 +933,7 @@ ENTRY(aesarmv8_ccm_enc1) subs x10, x10, #0x10 /* count down bytes */ str q3, [x2], #0x10 /* store ciphertext block */ b.ne 1b /* repeat if more blocks */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v2.16b, v2.16b /* q2 := ctr (big-endian) */ -#endif stp q0, q2, [x4] /* store updated auth/ctr */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret @@ -968,18 +958,12 @@ ENTRY(aesarmv8_ccm_dec1) ld1 {v5.4s}, [x11] /* q5 := (0,0,0,1) (host-endian) */ mov x9, x0 /* x9 := enckey */ mov x10, x3 /* x10 := nbytes */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */ -#endif /* Decrypt the first block. */ add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */ mov x3, x5 /* x3 := nrounds */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v0.16b, v2.16b /* q0 := ctr (big-endian) */ -#else - mov v0.16b, v2.16b /* q0 := ctr (big-endian) */ -#endif ldr q3, [x1], #0x10 /* q3 := ctxt */ bl aesarmv8_enc1 /* q0 := pad; trash x0/x3/q16 */ b 2f @@ -995,11 +979,7 @@ ENTRY(aesarmv8_ccm_dec1) add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */ mov x0, x9 /* x0 := enckey */ mov x3, x5 /* x3 := nrounds */ -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v0.16b, v2.16b /* q0 := ctr (big-endian) */ -#else - mov v0.16b, v2.16b /* q0 := ctr (big-endian) */ -#endif ldr q3, [x1], #0x10 /* q3 := ctxt */ bl aesarmv8_enc2 /* q0 := pad, q1 := auth'; * trash x0/x3/q16 */ @@ -1009,15 +989,14 @@ ENTRY(aesarmv8_ccm_dec1) eor v1.16b, v1.16b, v3.16b /* q1 := auth ^ ptxt */ b.ne 1b -#if _BYTE_ORDER == _LITTLE_ENDIAN rev32 v2.16b, v2.16b /* q2 := ctr (big-endian) */ -#endif /* Authenticate the last block. 
*/ mov x0, x9 /* x0 := enckey */ mov x3, x5 /* x3 := nrounds */ mov v0.16b, v1.16b /* q0 := auth ^ ptxt */ bl aesarmv8_enc1 /* q0 := auth'; trash x0/x3/q16 */ + stp q0, q2, [x4] /* store updated auth/ctr */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret Index: src/sys/crypto/aes/arch/arm/aes_neon.c diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.4 src/sys/crypto/aes/arch/arm/aes_neon.c:1.5 --- src/sys/crypto/aes/arch/arm/aes_neon.c:1.4 Tue Jul 28 20:11:09 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.c Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ +/* $NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -39,7 +39,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $"); #include <sys/types.h> @@ -60,141 +60,141 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v static const uint8x16_t mc_forward[4] = { - {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, - 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C}, - {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, - 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00}, - {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, - 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04}, - {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, - 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, + VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, + 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C), + VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, + 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00), + VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, + 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04), + VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, + 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08), }, mc_backward[4] __aarch64_used = { - {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, - 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, - {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, - 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A}, - {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, - 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06}, - {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, - 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, + VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, + 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E), + VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, + 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A), + VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, + 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06), + VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, + 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02), }, ipt[2] __aarch64_used = { - {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, - 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, - {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, - 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD}, + VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, + 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA), + VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, + 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD), }, opt[2] = { - {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, - 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7}, - {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, - 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, + VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, + 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7), + VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, + 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1), }, dipt[2] __aarch64_used = { - {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, - 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, - {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, - 
0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, + VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, + 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15), + VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, + 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12), }, sb1[2] __aarch64_used = { - {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, - 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, - {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, - 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, + VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, + 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5), + VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, + 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B), }, sb2[2] __aarch64_used = { - {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, - 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, - {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, - 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, + VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, + 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E), + VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, + 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2), }, sbo[2] __aarch64_used = { - {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, - 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, - {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, - 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, + VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, + 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15), + VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, + 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E), }, dsb9[2] __aarch64_used = { - {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, - 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, - {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, - 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, + VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, + 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA), + VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, + 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72), }, dsbd[2] __aarch64_used = { - {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, - 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, - {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, - 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, + VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, + 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5), + VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, + 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29), }, dsbb[2] __aarch64_used = { - {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, - 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, - {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, - 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, + VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, + 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60), + VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, + 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3), }, dsbe[2] __aarch64_used = { - {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, - 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, - {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, - 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, + VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, + 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22), + VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, + 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94), }, dsbo[2] __aarch64_used = { - {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, - 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, - {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, - 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA}, + VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, + 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7), + VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, + 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA), }, dks1[2] = { - {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, - 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A}, - {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, - 
0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B}, + VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, + 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A), + VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, + 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B), }, dks2[2] = { - {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, - 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46}, - {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, - 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73}, + VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, + 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46), + VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, + 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73), }, dks3[2] = { - {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, - 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8}, - {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, - 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5}, + VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, + 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8), + VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, + 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5), }, dks4[2] = { - {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, - 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0}, - {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, - 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F}, + VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, + 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0), + VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, + 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F), }, deskew[2] = { - {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, - 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D}, - {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, - 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, + VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, + 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D), + VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, + 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28), }, sr[4] __aarch64_used = { - {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, - 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, - {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, - 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B}, - {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, - 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07}, - {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, - 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03}, -}, -rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, - 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70}, -s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, - 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B}, -of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, - 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F}, -inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, - 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04}, -inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, - 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03}; + VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F), + VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, + 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B), + VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, + 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07), + VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, + 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03), +}, +rcon = VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, + 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70), +of = VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, + 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F), +s63 = VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, + 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B), +inv = VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, + 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04), +inva = VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, + 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03); static inline uint8x16_t loadroundkey(const void *rkp) Index: 
src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.4 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.4 Mon Jul 27 20:57:23 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include <arm/asm.h> -RCSID("$NetBSD: aes_neon_32.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $") .fpu neon @@ -270,7 +270,7 @@ ENTRY(aes_neon_enc1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ movw r3, #0 vmov.i8 q1, #0x0f @@ -280,8 +280,8 @@ ENTRY(aes_neon_enc1) /* (q4, q5) := (iptlo, ipthi) */ add r6, r12, #(iptlo - .Lconstants) add r7, r12, #(ipthi - .Lconstants) - vld1.64 {d8-d9}, [r6 :128] - vld1.64 {d10-d11}, [r7 :128] + vld1.8 {d8-d9}, [r6 :128] + vld1.8 {d10-d11}, [r7 :128] /* load the rest of the constants */ add r4, r12, #(sb1_0 - .Lconstants) @@ -290,12 +290,12 @@ ENTRY(aes_neon_enc1) add r7, r12, #(sb2_1 - .Lconstants) add r8, r12, #(inv - .Lconstants) add r10, r12, #(inva - .Lconstants) - vld1.64 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ - vld1.64 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ - vld1.64 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ - vld1.64 {d18-d19}, [r7 :128] /* q9 = sb2[1] */ - vld1.64 {d20-d21}, [r8 :128] /* q10 = inv */ - vld1.64 {d22-d23}, [r10 :128] /* q11 = inva */ + vld1.8 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ + vld1.8 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ + vld1.8 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ + vld1.8 {d18-d19}, [r7 :128] /* q9 = sb2[1] */ + vld1.8 {d20-d21}, [r8 :128] /* q10 = inv */ + vld1.8 {d22-d23}, [r10 :128] /* q11 = inva */ /* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */ add r4, r12, #(mc_forward - .Lconstants) @@ -319,7 +319,7 @@ ENTRY(aes_neon_enc1) b 2f _ALIGN_TEXT -1: vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ +1: vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */ vtbl.8 d24, {d12-d13}, d4 @@ -339,8 +339,8 @@ ENTRY(aes_neon_enc1) /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ add r6, r4, r3, lsl #4 add r7, r5, r3, lsl #4 - vld1.64 {d24-d25}, [r6] - vld1.64 {d26-d27}, [r7] + vld1.8 {d24-d25}, [r6] + vld1.8 {d26-d27}, [r7] /* q15 := A2_B = A2 + A(mcf) */ vtbl.8 d30, {d0-d1}, d24 @@ -412,11 +412,11 @@ ENTRY(aes_neon_enc1) add r6, r12, #(sbo_0 - .Lconstants) add r7, r12, #(sbo_1 - .Lconstants) add r8, r8, r3, lsl #4 - vld1.64 {d12-d13}, [r6 :128] - vld1.64 {d14-d15}, [r7 :128] - vld1.64 {d30-d31}, [r8 :128] + vld1.8 {d12-d13}, [r6 :128] + vld1.8 {d14-d15}, [r7 :128] + vld1.8 {d30-d31}, [r8 :128] - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4 @@ -489,7 +489,7 @@ ENTRY(aes_neon_dec1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! 
/* q14 = *rk++ */ rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */ vmov.i8 q1, #0x0f and r3, r3, #3 /* r3 := 3 & ~(x - 1) */ @@ -500,8 +500,8 @@ ENTRY(aes_neon_dec1) /* (q4, q5) := (diptlo, dipthi) */ add r6, r12, #(diptlo - .Lconstants) add r7, r12, #(dipthi - .Lconstants) - vld1.64 {d8-d9}, [r6 :128] - vld1.64 {d10-d11}, [r7 :128] + vld1.8 {d8-d9}, [r6 :128] + vld1.8 {d10-d11}, [r7 :128] /* load the rest of the constants */ add r4, r12, #(dsbb_0 - .Lconstants) @@ -509,11 +509,11 @@ ENTRY(aes_neon_dec1) add r6, r12, #(inv - .Lconstants) add r7, r12, #(inva - .Lconstants) add r8, r12, #(.Lmc_forward_3 - .Lconstants) - vld1.64 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ - vld1.64 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ - vld1.64 {d20-d21}, [r6 :128] /* q10 := inv */ - vld1.64 {d22-d23}, [r7 :128] /* q11 := inva */ - vld1.64 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */ + vld1.8 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ + vld1.8 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ + vld1.8 {d20-d21}, [r6 :128] /* q10 := inv */ + vld1.8 {d22-d23}, [r7 :128] /* q11 := inva */ + vld1.8 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */ /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -529,8 +529,8 @@ ENTRY(aes_neon_dec1) /* load dsb9 */ add r4, r12, #(dsb9_0 - .Lconstants) add r5, r12, #(dsb9_1 - .Lconstants) - vld1.64 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ - vld1.64 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ + vld1.8 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ + vld1.8 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ /* q0 := rk[0] + diptlo(lo) + dipthi(hi) */ veor q0, q14, q2 @@ -541,10 +541,10 @@ ENTRY(aes_neon_dec1) _ALIGN_TEXT 1: /* load dsbd */ add r4, r12, #(dsbd_0 - .Lconstants) - vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ - vld1.64 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ + vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ + vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */ vtbl.8 d24, {d8-d9}, d4 @@ -568,8 +568,8 @@ ENTRY(aes_neon_dec1) /* load dsbe */ add r4, r12, #(dsbe_0 - .Lconstants) - vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */ - vld1.64 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */ + vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */ + vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */ /* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */ vtbl.8 d28, {d0-d1}, d30 @@ -647,11 +647,11 @@ ENTRY(aes_neon_dec1) add r6, r12, #(dsbo_0 - .Lconstants) add r7, r12, #(dsbo_1 - .Lconstants) add r8, r8, r3, lsl #4 - vld1.64 {d12-d13}, [r6 :128] - vld1.64 {d14-d15}, [r7 :128] - vld1.64 {d30-d31}, [r8 :128] + vld1.8 {d12-d13}, [r6 :128] + vld1.8 {d14-d15}, [r7 :128] + vld1.8 {d30-d31}, [r8 :128] - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4 Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5 --- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4 Tue Jul 28 20:11:09 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ +/* $NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -27,9 +27,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); - -#include <sys/endian.h> +__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $"); #ifdef _KERNEL #include <sys/systm.h> @@ -38,6 +36,7 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon_sub #include <assert.h> #include <inttypes.h> #include <stdio.h> +#include <string.h> #define KASSERT assert #endif @@ -144,7 +143,8 @@ static inline uint8x16_t aes_neon_xts_update(uint8x16_t t8) { const int32x4_t zero = vdupq_n_s32(0); - const int32x4_t carry = {0x87, 1, 1, 1}; + /* (0x87,1,1,1) */ + const uint32x4_t carry = vsetq_lane_u32(0x87, vdupq_n_u32(1), 0); int32x4_t t, t_; uint32x4_t mask; @@ -161,32 +161,36 @@ static int aes_neon_xts_update_selftest(void) { static const struct { - uint32_t in[4], out[4]; + uint8_t in[16], out[16]; } cases[] = { - [0] = { {1}, {2} }, - [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, - [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, - [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, - [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, - [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + [0] = { {1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}, + {2,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} }, + [1] = { {0,0,0,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0}, + {0,0,0,0, 1,0,0,0, 0,0,0,0, 0,0,0,0} }, + [2] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0}, + {0,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} }, + [3] = { {0,0,0,0, 0,0,0,0, 0,0,0,0x80, 0,0,0,0}, + {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0} }, + [4] = { {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80}, + {0x87,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} }, + [5] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0x80}, + {0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} }, }; unsigned i; - uint32_t t[4]; + uint8_t t[16]; int result = 0; for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { - t[0] = cases[i].in[0]; - t[1] = cases[i].in[1]; - t[2] = cases[i].in[2]; - t[3] = cases[i].in[3]; - storeblock(t, aes_neon_xts_update(loadblock(t))); - if (t[0] != cases[i].out[0] || - t[1] != cases[i].out[1] || - t[2] != cases[i].out[2] || - t[3] != cases[i].out[3]) { - printf("%s %u:" - " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", - __func__, i, t[0], t[1], t[2], t[3]); + storeblock(t, aes_neon_xts_update(loadblock(cases[i].in))); + if (memcmp(t, cases[i].out, 16)) { + char buf[33]; + unsigned j; + + for (j = 0; j < 16; j++) { + snprintf(buf + 2*j, sizeof(buf) - 2*j, + " %02hhx", t[j]); + } + printf("%s %u: %s\n", __func__, i, buf); result = -1; } } @@ -289,22 +293,13 @@ aes_neon_cbcmac_update1(const struct aes * function, which should substantially improve CCM throughput. 
*/ -#if _BYTE_ORDER == _LITTLE_ENDIAN -#define vbetoh32q_u8 vrev32q_u8 -#define vhtobe32q_u8 vrev32q_u8 -#elif _BYTE_ORDER == _BIG_ENDIAN -#define vbetoh32q_u8(x) (x) -#define vhtobe32q_u8(x) (x) -#else -#error what kind of endian are you anyway -#endif - void aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], uint32_t nrounds) { - const uint32x4_t ctr32_inc = {0, 0, 0, 1}; + /* (0,0,0,1) */ + const uint32x4_t ctr32_inc = vsetq_lane_u32(1, vdupq_n_u32(0), 3); uint8x16_t auth, ptxt, ctr_be; uint32x4_t ctr; @@ -313,12 +308,12 @@ aes_neon_ccm_enc1(const struct aesenc *e auth = loadblock(authctr); ctr_be = loadblock(authctr + 16); - ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); + ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be)); for (; nbytes; nbytes -= 16, in += 16, out += 16) { uint8x16x2_t b2; ptxt = loadblock(in); ctr = vaddq_u32(ctr, ctr32_inc); - ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); + ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr)); b2.val[0] = auth ^ ptxt; b2.val[1] = ctr_be; @@ -335,7 +330,8 @@ aes_neon_ccm_dec1(const struct aesenc *e uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], uint32_t nrounds) { - const uint32x4_t ctr32_inc = {0, 0, 0, 1}; + /* (0,0,0,1) */ + const uint32x4_t ctr32_inc = vsetq_lane_u32(1, vdupq_n_u32(0), 3); uint8x16_t auth, ctr_be, ptxt, pad; uint32x4_t ctr; @@ -343,9 +339,9 @@ aes_neon_ccm_dec1(const struct aesenc *e KASSERT(nbytes % 16 == 0); ctr_be = loadblock(authctr + 16); - ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); + ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be)); ctr = vaddq_u32(ctr, ctr32_inc); - ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); + ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr)); pad = aes_neon_enc1(enc, ctr_be, nrounds); auth = loadblock(authctr); for (;; in += 16, out += 16) { @@ -359,7 +355,7 @@ aes_neon_ccm_dec1(const struct aesenc *e break; ctr = vaddq_u32(ctr, ctr32_inc); - ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); + ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr)); b2.val[0] = auth; b2.val[1] = ctr_be; b2 = aes_neon_enc2(enc, b2, nrounds); Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.3 --- src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2 Tue Jul 28 20:11:09 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */ +/* $NetBSD: aes_neon_impl.h,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -32,6 +32,7 @@ #include <sys/types.h> #include "arm_neon.h" +#include "arm_neon_imm.h" #include <crypto/aes/aes.h> #include <crypto/aes/arch/arm/aes_neon.h> Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.7 src/sys/crypto/aes/arch/arm/arm_neon.h:1.8 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.7 Tue Jul 28 20:11:09 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.8 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -39,6 +39,7 @@ typedef __Int32x4_t int32x4_t; typedef __Int64x2_t int64x2_t; typedef __Int8x16_t int8x16_t; +typedef __Uint16x8_t uint16x8_t; typedef __Uint32x4_t uint32x4_t; typedef __Uint64x2_t uint64x2_t; typedef __Uint8x16_t uint8x16_t; @@ -47,6 +48,7 @@ typedef struct { uint8x16_t val[2]; } ui typedef __simd128_int32_t int32x4_t; typedef __simd128_int64_t int64x2_t; typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint16_t uint16x8_t; typedef __simd128_uint32_t uint32x4_t; typedef __simd128_uint64_t uint64x2_t; typedef __simd128_uint8_t uint8x16_t; @@ -58,10 +60,15 @@ typedef struct { uint8x8_t val[2]; } uin typedef struct { uint8x16_t val[2]; } uint8x16x2_t; #endif -#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) -#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#if defined(__AARCH64EB__) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#elif defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) ((__i) ^ (__arraycount(__v) - 1)) +#define __neon_laneq_index(__v, __i) ((__i) ^ (__arraycount(__v)/2 - 1)) #else -#define __neon_lane_index(__v, __i) __i +#define __neon_lane_index(__v, __i) (__i) +#define __neon_laneq_index(__v, __i) (__i) #endif #elif defined(__clang__) @@ -72,17 +79,23 @@ typedef struct { uint8x16_t val[2]; } ui typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; + typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; +typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; + typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; #ifdef __LITTLE_ENDIAN__ #define __neon_lane_index(__v, __i) __i +#define __neon_laneq_index(__v, __i) __i #else #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - __i) #endif #else @@ -166,7 +179,8 @@ _INTRINSATTR static __inline uint8x16_t vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) { -#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) +#ifdef __aarch64__ +#if defined(__AARCH64EB__) return __builtin_shuffle(__hi, __lo, (uint8x16_t) { 16 - __i, 17 - __i, 18 - __i, 19 - __i, @@ -183,6 +197,10 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h __i + 12, __i + 13, __i + 14, __i + 15, }); #endif +#else + return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo, + (int8x16_t)__hi, __i); +#endif } #elif defined(__clang__) #ifdef __LITTLE_ENDIAN__ @@ -220,7 +238,7 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _ #elif defined(__clang__) #define vgetq_lane_u32(__v, __i) \ (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ - __neon_lane_index(__v, __i)) + __neon_laneq_index(__v, __i)) #endif _INTRINSATTR @@ -332,6 +350,27 @@ vreinterpretq_s32_u8(uint8x16_t __v) } _INTRINSATTR +static __inline uint16x8_t +vreinterpretq_u16_u32(uint32x4_t __v) +{ + return (uint16x8_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u16(uint16x8_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u64(uint64x2_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR static __inline 
uint32x4_t vreinterpretq_u32_u8(uint8x16_t __v) { @@ -340,6 +379,13 @@ vreinterpretq_u32_u8(uint8x16_t __v) _INTRINSATTR static __inline uint64x2_t +vreinterpretq_u64_u32(uint32x4_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t __v) { return (uint64x2_t)__v; @@ -367,6 +413,17 @@ vreinterpretq_u8_u64(uint64x2_t __v) } _INTRINSATTR +static __inline uint16x8_t +vrev32q_u16(uint16x8_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) + return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); +#elif defined(__clang__) + return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6); +#endif +} + +_INTRINSATTR static __inline uint8x16_t vrev32q_u8(uint8x16_t __v) { @@ -374,7 +431,7 @@ vrev32q_u8(uint8x16_t __v) return __builtin_shuffle(__v, (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); #elif defined(__clang__) - return __builtin_shufflevector(__v, + return __builtin_shufflevector(__v, __v, 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); #endif } @@ -384,13 +441,13 @@ _INTRINSATTR static __inline uint32x4_t vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) { - __v[__neon_lane_index(__v, __i)] = __x; + __v[__neon_laneq_index(__v, __i)] = __x; return __v; } #elif defined(__clang__) #define vsetq_lane_u32(__x, __v, __i) \ (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ - __neon_lane_index(__v, __i)) + __neon_laneq_index(__v, __i)) #endif #if defined(__GNUC__) && !defined(__clang__) @@ -398,13 +455,13 @@ _INTRINSATTR static __inline uint64x2_t vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) { - __v[__neon_lane_index(__v, __i)] = __x; + __v[__neon_laneq_index(__v, __i)] = __x; return __v; } #elif defined(__clang__) #define vsetq_lane_u64(__x, __v, __i) \ - (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \ - __neon_lane_index(__v, __i)); + (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ + __neon_laneq_index(__v, __i)); #endif #if defined(__GNUC__) && !defined(__clang__) @@ -435,7 +492,7 @@ vshrq_n_u32(uint32x4_t __v, uint8_t __bi #endif } #elif defined(__clang__) -#define vshrq_n_u8(__v, __bits) \ +#define vshrq_n_u32(__v, __bits) \ (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) #endif @@ -488,6 +545,40 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t #endif /* __LITTLE_ENDIAN__ */ #endif +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); +#else + return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, + (int32x4_t)__vsh, __bits); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vsriq_n_u32(__vins, __vsh, __bits) \ + (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \ + (int32x4_t)(__vsh), (__bits), 34) +#else +#define vsliq_n_s32(__vins, __vsh, __bits) ( \ +{ \ + int32x4_t __tvins = (__vins); \ + int32x4_t __tvsh = (__vsh); \ + uint8_t __tbits = (__bits); \ + int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ + 3,2,1,0); \ + int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ + 3,2,1,0); \ + int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits, \ + 34); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif +#endif + _INTRINSATTR static __inline void vst1q_u32(uint32_t *__p32, uint32x4_t __v) @@ -533,4 +624,58 @@ vst1q_u8(uint8_t *__p8, uint8x16_t 
__v) #endif } +#ifndef __aarch64__ /* XXX */ + +_INTRINSATTR +static __inline uint8x8_t +vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, + (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); + __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); +#endif + __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, + (int8x8_t)__idx, 16); +#ifndef __LITTLE_ENDIAN__ + __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); +#endif + return __ret; +#endif +} + +_INTRINSATTR +static __inline uint8x8_t +vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + union { + uint8x8x2_t __u8x8x82; + __builtin_neon_ti __ti; + } __u = { __tab }; + return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], + 7,6,5,4,3,2,1,0); + __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], + 7,6,5,4,3,2,1,0); + __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); +#endif + __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], + (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); +#ifndef __LITTLE_ENDIAN__ + __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); +#endif + return __ret; +#endif +} + +#endif /* !defined(__aarch64__) */ + #endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ Index: src/sys/crypto/chacha/arch/arm/arm_neon.h diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.4 --- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3 Mon Jul 27 20:58:56 2020 +++ src/sys/crypto/chacha/arch/arm/arm_neon.h Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.4 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -43,6 +43,7 @@ typedef __Uint16x8_t uint16x8_t; typedef __Uint32x4_t uint32x4_t; typedef __Uint64x2_t uint64x2_t; typedef __Uint8x16_t uint8x16_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; #else typedef __simd128_int32_t int32x4_t; typedef __simd128_int64_t int64x2_t; @@ -56,12 +57,18 @@ typedef __simd64_int8_t int8x8_t; typedef __simd64_uint8_t uint8x8_t; typedef __builtin_neon_udi uint64x1_t; typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; #endif -#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) -#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#if defined(__AARCH64EB__) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - (__i)) +#elif defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) ((__i) ^ (__arraycount(__v) - 1)) +#define __neon_laneq_index(__v, __i) ((__i) ^ (__arraycount(__v)/2 - 1)) #else -#define __neon_lane_index(__v, __i) __i +#define __neon_lane_index(__v, __i) (__i) +#define __neon_laneq_index(__v, __i) (__i) #endif #elif defined(__clang__) @@ -79,12 +86,16 @@ typedef __attribute__((neon_vector_type( typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; + typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +typedef struct { uint8x16_t val[2]; } uint8x16x2_t; #ifdef __LITTLE_ENDIAN__ #define __neon_lane_index(__v, __i) __i +#define __neon_laneq_index(__v, __i) __i #else #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - __i) #endif #else @@ -168,7 +179,8 @@ _INTRINSATTR static __inline uint8x16_t vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) { -#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) +#ifdef __aarch64__ +#if defined(__AARCH64EB__) return __builtin_shuffle(__hi, __lo, (uint8x16_t) { 16 - __i, 17 - __i, 18 - __i, 19 - __i, @@ -185,6 +197,10 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h __i + 12, __i + 13, __i + 14, __i + 15, }); #endif +#else + return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo, + (int8x16_t)__hi, __i); +#endif } #elif defined(__clang__) #ifdef __LITTLE_ENDIAN__ @@ -222,7 +238,7 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _ #elif defined(__clang__) #define vgetq_lane_u32(__v, __i) \ (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ - __neon_lane_index(__v, __i)) + __neon_laneq_index(__v, __i)) #endif _INTRINSATTR @@ -403,7 +419,7 @@ vrev32q_u16(uint16x8_t __v) #if defined(__GNUC__) && !defined(__clang__) return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); #elif defined(__clang__) - return __builtin_shufflevector(__v, 1,0, 3,2, 5,4, 7,6); + return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6); #endif } @@ -415,7 +431,7 @@ vrev32q_u8(uint8x16_t __v) return __builtin_shuffle(__v, (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); #elif defined(__clang__) - return __builtin_shufflevector(__v, + return __builtin_shufflevector(__v, __v, 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); #endif } @@ -425,13 +441,13 @@ _INTRINSATTR static __inline uint32x4_t vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) { - __v[__neon_lane_index(__v, __i)] = __x; + __v[__neon_laneq_index(__v, __i)] = __x; return __v; } #elif defined(__clang__) #define vsetq_lane_u32(__x, __v, __i) \ (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ - __neon_lane_index(__v, 
__i)) + __neon_laneq_index(__v, __i)) #endif #if defined(__GNUC__) && !defined(__clang__) @@ -439,13 +455,13 @@ _INTRINSATTR static __inline uint64x2_t vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) { - __v[__neon_lane_index(__v, __i)] = __x; + __v[__neon_laneq_index(__v, __i)] = __x; return __v; } #elif defined(__clang__) #define vsetq_lane_u64(__x, __v, __i) \ - (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \ - __neon_lane_index(__v, __i)); + (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ + __neon_laneq_index(__v, __i)); #endif #if defined(__GNUC__) && !defined(__clang__) @@ -476,7 +492,7 @@ vshrq_n_u32(uint32x4_t __v, uint8_t __bi #endif } #elif defined(__clang__) -#define vshrq_n_u8(__v, __bits) \ +#define vshrq_n_u32(__v, __bits) \ (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) #endif Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.8 --- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.7 Tue Jul 28 20:08:48 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $ */ +/* $NetBSD: chacha_neon.c,v 1.8 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -30,42 +30,18 @@ #include <sys/endian.h> #include "arm_neon.h" +#include "arm_neon_imm.h" #include "chacha_neon.h" -static inline uint32x4_t -vrolq_n_u32(uint32x4_t x, uint8_t n) -{ - - /* - * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in - * practice it hurts performance at least on Cortex-A8. - */ +/* + * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice + * it hurts performance at least on Cortex-A8. + */ #if 1 - return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); +#define vrolq_n_u32(x, n) (vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n))) #else - return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n); +#define vrolq_n_u32(x, n) vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n)) #endif -} - -static inline uint32x4_t -vhtole_u32(uint32x4_t x) -{ -#if _BYTE_ORDER == _LITTLE_ENDIAN - return x; -#elif _BYTE_ORDER == _BIG_ENDIAN - return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); -#endif -} - -static inline uint32x4_t -vletoh_u32(uint32x4_t x) -{ -#if _BYTE_ORDER == _LITTLE_ENDIAN - return x; -#elif _BYTE_ORDER == _BIG_ENDIAN - return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); -#endif -} static inline uint32x4_t rol16(uint32x4_t x) @@ -88,10 +64,10 @@ static inline uint32x4_t rol8(uint32x4_t x) { #if defined(__aarch64__) - static const uint8x16_t rol8_tab = { + static const uint8x16_t rol8_tab = VQ_N_U8( 3, 0, 1, 2, 7, 4, 5, 6, - 11, 8, 9,10, 15,12,13,14, - }; + 11, 8, 9,10, 15,12,13,14 + ); uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); y8 = vqtbl1q_u8(x8, rol8_tab); @@ -106,9 +82,9 @@ rol8(uint32x4_t x) * loops, but it doesn't and so attempting to use VTBL hurts * more than it helps. 
*/ - static const uint8x8_t rol8_tab = { - 3, 0, 1, 2, 7, 4, 5, 6, - }; + static const uint8x8_t rol8_tab = V_N_U8( + 3, 0, 1, 2, 7, 4, 5, 6 + ); uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); @@ -180,17 +156,17 @@ chacha_core_neon(uint8_t out[restrict st uint32x4_t in0, in1, in2, in3; uint32x4_t r0, r1, r2, r3; - r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); - r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); - r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); - r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); + r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c)); + r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); + r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); + r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in)); chacha_permute(&r0, &r1, &r2, &r3, nr); - vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); - vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); - vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); - vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); + vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0))); + vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1))); + vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2))); + vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3))); } void @@ -202,15 +178,15 @@ hchacha_neon(uint8_t out[restrict static { uint32x4_t r0, r1, r2, r3; - r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); - r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); - r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); - r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); + r0 = vreinterpretq_u32_u8(vld1q_u8(c)); + r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); + r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); + r3 = vreinterpretq_u32_u8(vld1q_u8(in)); chacha_permute(&r0, &r1, &r2, &r3, nr); - vst1q_u32((uint32_t *)out + 0, r0); - vst1q_u32((uint32_t *)out + 4, r3); + vst1q_u8(out + 0, vreinterpretq_u8_u32(r0)); + vst1q_u8(out + 16, vreinterpretq_u8_u32(r3)); } void @@ -225,19 +201,20 @@ chacha_stream_neon(uint8_t *restrict s, chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); if (n) { - const uint32x4_t blkno_inc = {1,0,0,0}; + const uint32x4_t blkno_inc = /* (1,0,0,0) */ + vsetq_lane_u32(1, vdupq_n_u32(0), 0); uint32x4_t in0, in1, in2, in3; uint32x4_t r0, r1, r2, r3; - in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); - in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); - in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); - in3 = (uint32x4_t) { + in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32)); + in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); + in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); + in3 = (uint32x4_t) VQ_N_U32( blkno, le32dec(nonce), le32dec(nonce + 4), le32dec(nonce + 8) - }; + ); for (; n; s += 64, n -= 64) { r0 = in0; @@ -245,27 +222,27 @@ chacha_stream_neon(uint8_t *restrict s, r2 = in2; r3 = in3; chacha_permute(&r0, &r1, &r2, &r3, nr); - r0 = vhtole_u32(vaddq_u32(r0, in0)); - r1 = vhtole_u32(vaddq_u32(r1, in1)); - r2 = vhtole_u32(vaddq_u32(r2, in2)); - r3 = vhtole_u32(vaddq_u32(r3, in3)); + r0 = vaddq_u32(r0, in0); + r1 = vaddq_u32(r1, in1); + r2 = vaddq_u32(r2, in2); + r3 = vaddq_u32(r3, in3); if (n < 64) { uint8_t buf[64] __aligned(16); - vst1q_u32((uint32_t *)buf + 4*0, r0); - vst1q_u32((uint32_t *)buf + 4*1, r1); - vst1q_u32((uint32_t *)buf + 4*2, r2); - vst1q_u32((uint32_t *)buf + 4*3, r3); + vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0)); + vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1)); + vst1q_u8(buf + 32, 
vreinterpretq_u8_u32(r2)); + vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3)); memcpy(s, buf, n); break; } - vst1q_u32((uint32_t *)s + 4*0, r0); - vst1q_u32((uint32_t *)s + 4*1, r1); - vst1q_u32((uint32_t *)s + 4*2, r2); - vst1q_u32((uint32_t *)s + 4*3, r3); + vst1q_u8(s + 0, vreinterpretq_u8_u32(r0)); + vst1q_u8(s + 16, vreinterpretq_u8_u32(r1)); + vst1q_u8(s + 32, vreinterpretq_u8_u32(r2)); + vst1q_u8(s + 48, vreinterpretq_u8_u32(r3)); in3 = vaddq_u32(in3, blkno_inc); } } @@ -284,19 +261,20 @@ chacha_stream_xor_neon(uint8_t *s, const chacha_const32, nr); if (n) { - const uint32x4_t blkno_inc = {1,0,0,0}; + const uint32x4_t blkno_inc = /* (1,0,0,0) */ + vsetq_lane_u32(1, vdupq_n_u32(0), 0); uint32x4_t in0, in1, in2, in3; uint32x4_t r0, r1, r2, r3; - in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); - in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); - in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); - in3 = (uint32x4_t) { + in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32)); + in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0)); + in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16)); + in3 = (uint32x4_t) VQ_N_U32( blkno, le32dec(nonce), le32dec(nonce + 4), le32dec(nonce + 8) - }; + ); for (; n; s += 64, p += 64, n -= 64) { r0 = in0; @@ -304,19 +282,19 @@ chacha_stream_xor_neon(uint8_t *s, const r2 = in2; r3 = in3; chacha_permute(&r0, &r1, &r2, &r3, nr); - r0 = vhtole_u32(vaddq_u32(r0, in0)); - r1 = vhtole_u32(vaddq_u32(r1, in1)); - r2 = vhtole_u32(vaddq_u32(r2, in2)); - r3 = vhtole_u32(vaddq_u32(r3, in3)); + r0 = vaddq_u32(r0, in0); + r1 = vaddq_u32(r1, in1); + r2 = vaddq_u32(r2, in2); + r3 = vaddq_u32(r3, in3); if (n < 64) { uint8_t buf[64] __aligned(16); unsigned i; - vst1q_u32((uint32_t *)buf + 4*0, r0); - vst1q_u32((uint32_t *)buf + 4*1, r1); - vst1q_u32((uint32_t *)buf + 4*2, r2); - vst1q_u32((uint32_t *)buf + 4*3, r3); + vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0)); + vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1)); + vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2)); + vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3)); for (i = 0; i < n - n%4; i += 4) le32enc(s + i, @@ -327,14 +305,14 @@ chacha_stream_xor_neon(uint8_t *s, const break; } - r0 ^= vld1q_u32((const uint32_t *)p + 4*0); - r1 ^= vld1q_u32((const uint32_t *)p + 4*1); - r2 ^= vld1q_u32((const uint32_t *)p + 4*2); - r3 ^= vld1q_u32((const uint32_t *)p + 4*3); - vst1q_u32((uint32_t *)s + 4*0, r0); - vst1q_u32((uint32_t *)s + 4*1, r1); - vst1q_u32((uint32_t *)s + 4*2, r2); - vst1q_u32((uint32_t *)s + 4*3, r3); + r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0)); + r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16)); + r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32)); + r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48)); + vst1q_u8(s + 0, vreinterpretq_u8_u32(r0)); + vst1q_u8(s + 16, vreinterpretq_u8_u32(r1)); + vst1q_u8(s + 32, vreinterpretq_u8_u32(r2)); + vst1q_u8(s + 48, vreinterpretq_u8_u32(r3)); in3 = vaddq_u32(in3, blkno_inc); } } Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3 --- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2 Wed Jul 29 14:23:59 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $ */ +/* $NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -28,7 +28,7 @@ #include <machine/asm.h> -RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $") .fpu neon @@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 */ .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3 - vld1.32 {\c2-\c3}, [fp, :256] + vld1.8 {\c2-\c3}, [fp, :256] .endm .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h @@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vadd.u32 \c2, \c2, \d2 vadd.u32 \c3, \c3, \d3 - vst1.32 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ + vst1.8 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ veor \c0, \b0, \c0 veor \c1, \b1, \c1 @@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vtbl.8 \d3l, {\d3l}, \c0l vtbl.8 \d3h, {\d3h}, \c0l - vld1.32 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ + vld1.8 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ /* c += d; b ^= c; b <<<= 7 */ vadd.u32 \c2, \c2, \d2 @@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vadd.u32 \c0, \c0, \d0 vadd.u32 \c1, \c1, \d1 - vst1.32 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ + vst1.8 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ veor \c2, \b2, \c2 veor \c3, \b3, \c3 @@ -143,14 +143,6 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vsri.u32 \b1, \c3, #(32 - 7) .endm -#if _BYTE_ORDER == _LITTLE_ENDIAN -#define HTOLE32(x) -#define LE32TOH(x) -#elif _BYTE_ORDER == _BIG_ENDIAN -#define HTOLE32(x) vrev32.8 x, x -#define LE32TOH(x) vrev32.8 x, x -#endif - .text .p2align 2 .Lconstants_addr: @@ -183,10 +175,16 @@ ENTRY(chacha_stream256_neon) ldm ip, {r4, r5} /* r4 := const, r5 := nr */ ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ - vld1.32 {q12}, [r4] /* q12 := constant */ - vld1.32 {q13-q14}, [r3] /* q13-q14 := key */ + vld1.8 {q12}, [r4] /* q12 := constant */ + vld1.8 {q13-q14}, [r3] /* q13-q14 := key */ vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ +#ifdef __ARM_BIG_ENDIAN + rev r6, r6 + rev r8, r8 + rev r10, r10 +#endif + vdup.32 q0, d24[0] /* q0-q3 := constant */ vdup.32 q1, d24[1] vdup.32 q2, d25[0] @@ -205,23 +203,6 @@ ENTRY(chacha_stream256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) - b 2f _ALIGN_TEXT @@ -283,9 +264,9 @@ ENTRY(chacha_stream256_neon) vzip.32 q6, q7 vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ - vld1.32 {q9}, [r4] /* q9 := constant */ + vld1.8 {q9}, [r4] /* q9 := constant */ vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ - vld1.32 {q8}, [r3]! /* q8 := key[0:16) */ + vld1.8 {q8}, [r3]! /* q8 := key[0:16) */ vswp d1, d4 vswp d9, d12 @@ -330,19 +311,10 @@ ENTRY(chacha_stream256_neon) vadd.u32 q3, q3, q8 vadd.u32 q7, q7, q8 - vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ - - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q3) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q6) - LE32TOH(q7) + vld1.8 {q8-q9}, [fp, :256] /* restore q8-q9 */ - vst1.32 {q0-q1}, [r0]! - vld1.32 {q0}, [r3] /* q0 := key[16:32) */ + vst1.8 {q0-q1}, [r0]! 
+ vld1.8 {q0}, [r3] /* q0 := key[16:32) */ mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ vmov d2, r3, r6 vmov d3, r8, r10 @@ -370,23 +342,14 @@ ENTRY(chacha_stream256_neon) vadd.u32 q11, q11, q1 vadd.u32 q15, q15, q1 - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) - - /* vst1.32 {q0-q1}, [r0]! */ - vst1.32 {q8-q9}, [r0]! - vst1.32 {q2-q3}, [r0]! - vst1.32 {q10-q11}, [r0]! - vst1.32 {q4-q5}, [r0]! - vst1.32 {q12-q13}, [r0]! - vst1.32 {q6-q7}, [r0]! - vst1.32 {q14-q15}, [r0] + /* vst1.8 {q0-q1}, [r0]! */ + vst1.8 {q8-q9}, [r0]! + vst1.8 {q2-q3}, [r0]! + vst1.8 {q10-q11}, [r0]! + vst1.8 {q4-q5}, [r0]! + vst1.8 {q12-q13}, [r0]! + vst1.8 {q6-q7}, [r0]! + vst1.8 {q14-q15}, [r0] /* zero temporary space on the stack */ vmov.i32 q0, #0 @@ -426,10 +389,16 @@ ENTRY(chacha_stream_xor256_neon) ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */ ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ - vld1.32 {q12}, [r5] /* q12 := constant */ - vld1.32 {q13-q14}, [r4] /* q13-q14 := key */ + vld1.8 {q12}, [r5] /* q12 := constant */ + vld1.8 {q13-q14}, [r4] /* q13-q14 := key */ vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ +#ifdef __ARM_BIG_ENDIAN + rev r6, r6 + rev r8, r8 + rev r10, r10 +#endif + vdup.32 q0, d24[0] /* q0-q3 := constant */ vdup.32 q1, d24[1] vdup.32 q2, d25[0] @@ -448,23 +417,6 @@ ENTRY(chacha_stream_xor256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) - b 2f _ALIGN_TEXT @@ -496,9 +448,9 @@ ENTRY(chacha_stream_xor256_neon) vzip.32 q6, q7 vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ - vld1.32 {q9}, [r5] /* q9 := constant */ + vld1.8 {q9}, [r5] /* q9 := constant */ vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ - vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ + vld1.8 {q8}, [r4]! /* q8 := key[0:16) */ vswp d3, d6 vswp d9, d12 @@ -518,24 +470,15 @@ ENTRY(chacha_stream_xor256_neon) vadd.u32 q3, q3, q8 vadd.u32 q7, q7, q8 - vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ - - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q6) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q3) - LE32TOH(q7) + vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ veor q1, q1, q9 - vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ + vld1.8 {q8-q9}, [fp, :256] /* restore q8-q9 */ - vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ - vld1.32 {q0}, [r4] /* q0 := key[16:32) */ + vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ + vld1.8 {q0}, [r4] /* q0 := key[16:32) */ mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ vmov d2, r3, r6 vmov d3, r8, r10 @@ -563,57 +506,48 @@ ENTRY(chacha_stream_xor256_neon) vadd.u32 q11, q11, q1 vadd.u32 q15, q15, q1 - vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ - - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) + vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ veor q1, q1, q9 - vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ - vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ - vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ + vld1.8 {q8-q9}, [r1]! 
/* load plaintext bytes [64:96) */ + vst1.8 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ + vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ veor q3, q3, q9 - vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ - vst1.32 {q2-q3}, [r0]! /* store ciphertext bytes [64:80) */ + vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ + vst1.8 {q2-q3}, [r0]! /* store ciphertext bytes [64:80) */ veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ veor q11, q11, q1 - vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ - vst1.32 {q10-q11}, [r0]! /* store ciphertext bytes [80:96) */ + vld1.8 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ + vst1.8 {q10-q11}, [r0]! /* store ciphertext bytes [80:96) */ veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ veor q5, q5, q9 - vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ - vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [96:112) */ + vld1.8 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ + vst1.8 {q4-q5}, [r0]! /* store ciphertext bytes [96:112) */ veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ veor q13, q13, q1 - vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ - vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [112:128) */ + vld1.8 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ + vst1.8 {q12-q13}, [r0]! /* store ciphertext bytes [112:128) */ veor q6, q6, q8 /* compute ciphertext bytes [192:224) */ veor q7, q7, q9 - vst1.32 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */ + vst1.8 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */ veor q14, q14, q0 /* compute ciphertext bytes [224:256) */ veor q15, q15, q1 - vst1.32 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */ + vst1.8 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */ /* zero temporary space on the stack */ vmov.i32 q0, #0 @@ -637,5 +571,5 @@ END(v0123) .type rot8,%object rot8: - .long 0x02010003, 0x06050407 + .byte 3,0,1,2, 7,4,5,6 END(rot8) Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6 --- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5 Tue Jul 28 15:42:41 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Sat Aug 8 14:47:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $ */ +/* $NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -28,7 +28,7 @@ #include <aarch64/asm.h> -RCSID("$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $") #define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \ STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ @@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -234,22 +234,22 @@ ENTRY(chacha_stream256_neon) add v14.4s, v14.4s, v30.4s add v15.4s, v15.4s, v31.4s - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) st4 { v0.s, v1.s, v2.s, v3.s}[0], [x0], #16 st4 { v4.s, v5.s, v6.s, v7.s}[0], [x0], #16 @@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -401,22 +401,22 @@ ENTRY(chacha_stream_xor256_neon) ld4 {v24.s,v25.s,v26.s,v27.s}[3], [x1], #16 ld4 {v28.s,v29.s,v30.s,v31.s}[3], [x1], #16 - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) eor v16.16b, v16.16b, 
v0.16b eor v17.16b, v17.16b, v1.16b Added files: Index: src/sys/crypto/aes/arch/arm/arm_neon_imm.h diff -u /dev/null src/sys/crypto/aes/arch/arm/arm_neon_imm.h:1.1 --- /dev/null Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon_imm.h Sat Aug 8 14:47:01 2020 @@ -0,0 +1,80 @@ +/* $NetBSD: arm_neon_imm.h,v 1.1 2020/08/08 14:47:01 riastradh Exp $ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H +#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H + +/* + * Non-standard macros for writing ARM NEON vector literals. Needed + * because apparently GCC and Clang disagree wildly on the ordering of + * vector literal components -- and both disagree with the + * architectural indexing! 
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__)
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
+#define VQ_N_U32(a,b,c,d) \
+ {d,c, b,a}
+#elif defined(__ARM_BIG_ENDIAN)
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define VQ_N_U32(a,b,c,d) \
+ {b,a, d,c}
+#else
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {a,b,c,d,e,f,g,h}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define VQ_N_U32(a,b,c,d) \
+ {a,b, c,d}
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {a,b,c,d,e,f,g,h}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define VQ_N_U32(a,b,c,d) \
+ {a,b, c,d}
+#else
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define VQ_N_U32(a,b,c,d) \
+ {d,c, b,a}
+#endif
+#endif
+
+#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H */
Index: src/sys/crypto/chacha/arch/arm/arm_neon_imm.h
diff -u /dev/null src/sys/crypto/chacha/arch/arm/arm_neon_imm.h:1.1
--- /dev/null Sat Aug 8 14:47:01 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon_imm.h Sat Aug 8 14:47:01 2020
@@ -0,0 +1,80 @@
+/* $NetBSD: arm_neon_imm.h,v 1.1 2020/08/08 14:47:01 riastradh Exp $ */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H
+#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H
+
+/*
+ * Non-standard macros for writing ARM NEON vector literals. Needed
+ * because apparently GCC and Clang disagree wildly on the ordering of
+ * vector literal components -- and both disagree with the
+ * architectural indexing!
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__)
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
+#define VQ_N_U32(a,b,c,d) \
+ {d,c, b,a}
+#elif defined(__ARM_BIG_ENDIAN)
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define VQ_N_U32(a,b,c,d) \
+ {b,a, d,c}
+#else
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {a,b,c,d,e,f,g,h}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define VQ_N_U32(a,b,c,d) \
+ {a,b, c,d}
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {a,b,c,d,e,f,g,h}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define VQ_N_U32(a,b,c,d) \
+ {a,b, c,d}
+#else
+#define V_N_U8(a,b,c,d,e,f,g,h) \
+ {h,g,f,e,d,c,b,a}
+#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
+ {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define VQ_N_U32(a,b,c,d) \
+ {d,c, b,a}
+#endif
+#endif
+
+#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H */
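
[Editor's note: usage sketch only, not part of this commit. The function names below are hypothetical; the first constant is the standard ChaCha "expand 32-byte k" words, and the byte table is a 128-bit analogue of the rot8 table in chacha_neon_32.S above. The idea is that a constant is written once in architectural lane/byte order, and the V_N_U8/VQ_N_U8/VQ_N_U32 macros permute the initializer so that GCC and Clang are both expected to materialize the same register value on little- and big-endian ARM.]

#include <arm_neon.h>

#include "arm_neon_imm.h"

/* ChaCha constant "expa" "nd 3" "2-by" "te k", architectural lanes 0..3 */
static inline uint32x4_t
chacha_sigma(void)
{
	uint32x4_t sigma = VQ_N_U32(0x61707865, 0x3320646e,
	    0x79622d32, 0x6b206574);

	return sigma;
}

/* byte indices for a 32-bit rotate-left-by-8 done as a tbl-style shuffle */
static inline uint8x16_t
rot8_table(void)
{
	uint8x16_t tab = VQ_N_U8(3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14);

	return tab;
}

[With these macros the intent is that, e.g., vgetq_lane_u32(chacha_sigma(), 0) yields 0x61707865 on every supported compiler/endianness combination, which is what the surrounding NEON code relies on.]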