Module Name:    src
Committed By:   riastradh
Date:           Sat Aug  8 14:47:01 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_armv8_64.S aes_neon.c aes_neon_32.S
            aes_neon_impl.h aes_neon_subr.c arm_neon.h
        src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c
            chacha_neon_32.S chacha_neon_64.S
Added Files:
        src/sys/crypto/aes/arch/arm: arm_neon_imm.h
        src/sys/crypto/chacha/arch/arm: arm_neon_imm.h

Log Message:
Fix ARM NEON implementations of AES and ChaCha on big-endian ARM.

Add new macros such as VQ_N_U32(a,b,c,d) for NEON vector initializers.
These are needed because GCC and Clang disagree on the ordering of
lanes in vector initializers, depending on whether the target is
64-bit big-endian, 32-bit big-endian, or little-endian -- and,
bizarrely, both of them disagree with the architectural numbering of
lanes.
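
As an illustration only, macros of this shape can paper over the
disagreement by dispatching on the target configuration; the element
orders used for the two big-endian cases below are assumptions made
for the sake of the sketch, not the actual contents of the new
arm_neon_imm.h:

	/* Sketch: expand a uint32x4_t initializer in the compiler's lane order. */
	#if defined(__AARCH64EB__)			/* 64-bit big-endian */
	#define	VQ_N_U32(a,b,c,d)	{ d,c,b,a }
	#elif defined(__ARM_BIG_ENDIAN)			/* 32-bit big-endian */
	#define	VQ_N_U32(a,b,c,d)	{ b,a,d,c }	/* swap within each 64-bit half */
	#else						/* little-endian */
	#define	VQ_N_U32(a,b,c,d)	{ a,b,c,d }
	#endif

A 16-lane VQ_N_U8 (and an 8-lane V_N_U8 for 64-bit vectors) can be
defined along the same lines, so the table constants in the diff below
keep the same element order in the source on every configuration.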

Experimented with using

	static const uint8_t x8[16] = {...};

	uint8x16_t x = vld1q_u8(x8);

which doesn't require knowing anything about the ordering of lanes,
but it generates considerably worse code and apparently confuses GCC
into not recognizing the constant value of x8.

Fix some mistakes in the Clang code paths while here, too.
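
Where a constant vector is only needed at run time rather than as a
static table, the patch composes it from intrinsics whose lane
arguments already follow the architectural numbering, so no
initializer ordering is involved at all -- for example, from
aes_neon_subr.c below:

	/* (0,0,0,1) */
	const uint32x4_t ctr32_inc = vsetq_lane_u32(1, vdupq_n_u32(0), 3);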


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/sys/crypto/aes/arch/arm/aes_armv8_64.S
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_neon.c \
    src/sys/crypto/aes/arch/arm/aes_neon_32.S \
    src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/arm_neon.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/arm_neon.h
cvs rdiff -u -r0 -r1.1 src/sys/crypto/chacha/arch/arm/arm_neon_imm.h
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.11 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.11	Mon Jul 27 20:57:23 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,11 +26,9 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <sys/endian.h>
-
 #include <aarch64/asm.h>
 
-RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $")
 
 	.arch_extension	aes
 
@@ -921,19 +919,13 @@ ENTRY(aesarmv8_ccm_enc1)
 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
-#endif
 	_ALIGN_TEXT
 1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
-#else
-	mov	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
-#endif
 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
 					 * trash x0/x3/q16 */
@@ -941,9 +933,7 @@ ENTRY(aesarmv8_ccm_enc1)
 	subs	x10, x10, #0x10		/* count down bytes */
 	str	q3, [x2], #0x10		/* store ciphertext block */
 	b.ne	1b			/* repeat if more blocks */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
-#endif
 	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
@@ -968,18 +958,12 @@ ENTRY(aesarmv8_ccm_dec1)
 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
-#endif
 
 	/* Decrypt the first block.  */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
 	b	2f
@@ -995,11 +979,7 @@ ENTRY(aesarmv8_ccm_dec1)
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
 					 * trash x0/x3/q16 */
@@ -1009,15 +989,14 @@ ENTRY(aesarmv8_ccm_dec1)
 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
 	b.ne	1b
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
-#endif
 
 	/* Authenticate the last block.  */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
+
 	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.4 src/sys/crypto/aes/arch/arm/aes_neon.c:1.5
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.4	Tue Jul 28 20:11:09 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
 
 #include <sys/types.h>
 
@@ -60,141 +60,141 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v
 
 static const uint8x16_t
 mc_forward[4] = {
-	{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
-	 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
-	{0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
-	 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
-	{0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
-	 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
-	{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
-	 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
+	VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
+	    0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
+	VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
+	    0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
+	VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
+	    0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
+	VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
+	    0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
 },
 mc_backward[4] __aarch64_used = {
-	{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
-	 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
-	{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
-	 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
-	{0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
-	 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
-	{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
-	 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
+	VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
+	    0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
+	VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
+	    0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
+	VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
+	    0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
+	VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
+	    0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
 },
 ipt[2] __aarch64_used = {
-	{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
-	 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
-	{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
-	 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
+	VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
+	    0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
+	VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
+	    0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
 },
 opt[2] = {
-	{0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
-	 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
-	{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
-	 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
+	VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
+	    0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
+	VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
+	    0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
 },
 dipt[2] __aarch64_used = {
-	{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
-	 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
-	{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
-	 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
+	VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
+	    0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
+	VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
+	    0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
 },
 sb1[2] __aarch64_used = {
-	{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
-	 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
-	{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
-	 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
+	VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
+	    0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
+	VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
+	    0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
 },
 sb2[2] __aarch64_used = {
-	{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
-	 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
-	{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
-	 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
+	VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
+	    0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
+	VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
+	    0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
 },
 sbo[2] __aarch64_used = {
-	{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
-	 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
-	{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
-	 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
+	VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
+	    0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
+	VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
+	    0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
 },
 dsb9[2] __aarch64_used = {
-	{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
-	 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
-	{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
-	 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
+	VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
+	    0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
+	VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
+	    0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
 },
 dsbd[2] __aarch64_used = {
-	{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
-	 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
-	{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
-	 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
+	VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
+	    0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
+	VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
+	    0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
 },
 dsbb[2] __aarch64_used = {
-	{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
-	 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
-	{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
-	 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
+	VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
+	    0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
+	VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
+	    0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
 },
 dsbe[2] __aarch64_used = {
-	{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
-	 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
-	{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
-	 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
+	VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
+	    0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
+	VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
+	    0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
 },
 dsbo[2] __aarch64_used = {
-	{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
-	 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
-	{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
-	 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
+	VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
+	    0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
+	VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
+	    0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
 },
 dks1[2] = {
-	{0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
-	 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
-	{0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
-	 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
+	VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
+	    0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
+	VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
+	    0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
 },
 dks2[2] = {
-	{0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
-	 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
-	{0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
-	 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
+	VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
+	    0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
+	VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
+	    0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
 },
 dks3[2] = {
-	{0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
-	 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
-	{0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
-	 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
+	VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
+	    0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
+	VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
+	    0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
 },
 dks4[2] = {
-	{0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
-	 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
-	{0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
-	 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
+	VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
+	    0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
+	VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
+	    0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
 },
 deskew[2] = {
-	{0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
-	 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
-	{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
-	 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
+	VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
+	    0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
+	VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
+	    0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
 },
 sr[4] __aarch64_used = {
-	{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
-	 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
-	{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
-	 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
-	{0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
-	 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
-	{0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
-	 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
-},
-rcon =	{0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
-	0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
-s63 =	{0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
-	0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
-of =	{0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
-	0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
-inv =	{0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
-	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
-inva =	{0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
-	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};
+	VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+	    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
+	VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
+	    0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
+	VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
+	    0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
+	VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
+	    0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
+},
+rcon	= VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
+	    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
+of	= VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
+	    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
+s63	= VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
+	    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
+inv	= VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
+	    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
+inva	= VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
+	    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);
 
 static inline uint8x16_t
 loadroundkey(const void *rkp)
Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.4 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.4	Mon Jul 27 20:57:23 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <arm/asm.h>
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $")
 
 	.fpu	neon
 
@@ -270,7 +270,7 @@ ENTRY(aes_neon_enc1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr
 
-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f
 
@@ -280,8 +280,8 @@ ENTRY(aes_neon_enc1)
 	/* (q4, q5) := (iptlo, ipthi) */
 	add	r6, r12, #(iptlo - .Lconstants)
 	add	r7, r12, #(ipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]
 
 	/* load the rest of the constants */
 	add	r4, r12, #(sb1_0 - .Lconstants)
@@ -290,12 +290,12 @@ ENTRY(aes_neon_enc1)
 	add	r7, r12, #(sb2_1 - .Lconstants)
 	add	r8, r12, #(inv - .Lconstants)
 	add	r10, r12, #(inva - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
-	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
-	vld1.64	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
-	vld1.64	{d20-d21}, [r8 :128]	/* q10 = inv */
-	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
+	vld1.8	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
+	vld1.8	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
+	vld1.8	{d20-d21}, [r8 :128]	/* q10 = inv */
+	vld1.8	{d22-d23}, [r10 :128]	/* q11 = inva */
 
 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
 	add	r4, r12, #(mc_forward - .Lconstants)
@@ -319,7 +319,7 @@ ENTRY(aes_neon_enc1)
 	b	2f
 
 	_ALIGN_TEXT
-1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+1:	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
 	vtbl.8	d24, {d12-d13}, d4
@@ -339,8 +339,8 @@ ENTRY(aes_neon_enc1)
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
 	add	r6, r4, r3, lsl #4
 	add	r7, r5, r3, lsl #4
-	vld1.64	{d24-d25}, [r6]
-	vld1.64	{d26-d27}, [r7]
+	vld1.8	{d24-d25}, [r6]
+	vld1.8	{d26-d27}, [r7]
 
 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {d0-d1}, d24
@@ -412,11 +412,11 @@ ENTRY(aes_neon_enc1)
 	add	r6, r12, #(sbo_0 - .Lconstants)
 	add	r7, r12, #(sbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]
 
-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
@@ -489,7 +489,7 @@ ENTRY(aes_neon_dec1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr
 
-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */
@@ -500,8 +500,8 @@ ENTRY(aes_neon_dec1)
 	/* (q4, q5) := (diptlo, dipthi) */
 	add	r6, r12, #(diptlo - .Lconstants)
 	add	r7, r12, #(dipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]
 
 	/* load the rest of the constants */
 	add	r4, r12, #(dsbb_0 - .Lconstants)
@@ -509,11 +509,11 @@ ENTRY(aes_neon_dec1)
 	add	r6, r12, #(inv - .Lconstants)
 	add	r7, r12, #(inva - .Lconstants)
 	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
-	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
-	vld1.64	{d22-d23}, [r7 :128]	/* q11 := inva */
-	vld1.64	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
+	vld1.8	{d20-d21}, [r6 :128]	/* q10 := inv */
+	vld1.8	{d22-d23}, [r7 :128]	/* q11 := inva */
+	vld1.8	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -529,8 +529,8 @@ ENTRY(aes_neon_dec1)
 	/* load dsb9 */
 	add	r4, r12, #(dsb9_0 - .Lconstants)
 	add	r5, r12, #(dsb9_1 - .Lconstants)
-	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
-	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
+	vld1.8	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
+	vld1.8	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
 
 	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
 	veor	q0, q14, q2
@@ -541,10 +541,10 @@ ENTRY(aes_neon_dec1)
 	_ALIGN_TEXT
 1:	/* load dsbd */
 	add	r4, r12, #(dsbd_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
 
-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
 	vtbl.8	d24, {d8-d9}, d4
@@ -568,8 +568,8 @@ ENTRY(aes_neon_dec1)
 
 	/* load dsbe */
 	add	r4, r12, #(dsbe_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
 
 	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
 	vtbl.8	d28, {d0-d1}, d30
@@ -647,11 +647,11 @@ ENTRY(aes_neon_dec1)
 	add	r6, r12, #(dsbo_0 - .Lconstants)
 	add	r7, r12, #(dsbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]
 
-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4	Tue Jul 28 20:11:09 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,9 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
-
-#include <sys/endian.h>
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
 
 #ifdef _KERNEL
 #include <sys/systm.h>
@@ -38,6 +36,7 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon_sub
 #include <assert.h>
 #include <inttypes.h>
 #include <stdio.h>
+#include <string.h>
 #define	KASSERT			assert
 #endif
 
@@ -144,7 +143,8 @@ static inline uint8x16_t
 aes_neon_xts_update(uint8x16_t t8)
 {
 	const int32x4_t zero = vdupq_n_s32(0);
-	const int32x4_t carry = {0x87, 1, 1, 1};
+	/* (0x87,1,1,1) */
+	const uint32x4_t carry = vsetq_lane_u32(0x87, vdupq_n_u32(1), 0);
 	int32x4_t t, t_;
 	uint32x4_t mask;
 
@@ -161,32 +161,36 @@ static int
 aes_neon_xts_update_selftest(void)
 {
 	static const struct {
-		uint32_t in[4], out[4];
+		uint8_t in[16], out[16];
 	} cases[] = {
-		[0] = { {1}, {2} },
-		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
-		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
-		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
-		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
-		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+		[0] = { {1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
+			{2,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[1] = { {0,0,0,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 1,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[2] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
+		[3] = { {0,0,0,0, 0,0,0,0, 0,0,0,0x80, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0} },
+		[4] = { {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[5] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
 	};
 	unsigned i;
-	uint32_t t[4];
+	uint8_t t[16];
 	int result = 0;
 
 	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
-		t[0] = cases[i].in[0];
-		t[1] = cases[i].in[1];
-		t[2] = cases[i].in[2];
-		t[3] = cases[i].in[3];
-		storeblock(t, aes_neon_xts_update(loadblock(t)));
-		if (t[0] != cases[i].out[0] ||
-		    t[1] != cases[i].out[1] ||
-		    t[2] != cases[i].out[2] ||
-		    t[3] != cases[i].out[3]) {
-			printf("%s %u:"
-			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
-			    __func__, i, t[0], t[1], t[2], t[3]);
+		storeblock(t, aes_neon_xts_update(loadblock(cases[i].in)));
+		if (memcmp(t, cases[i].out, 16)) {
+			char buf[33];
+			unsigned j;
+
+			for (j = 0; j < 16; j++) {
+				snprintf(buf + 2*j, sizeof(buf) - 2*j,
+				    " %02hhx", t[j]);
+			}
+			printf("%s %u: %s\n", __func__, i, buf);
 			result = -1;
 		}
 	}
@@ -289,22 +293,13 @@ aes_neon_cbcmac_update1(const struct aes
  * function, which should substantially improve CCM throughput.
  */
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	vbetoh32q_u8	vrev32q_u8
-#define	vhtobe32q_u8	vrev32q_u8
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	vbetoh32q_u8(x)	(x)
-#define	vhtobe32q_u8(x)	(x)
-#else
-#error what kind of endian are you anyway
-#endif
-
 void
 aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
     uint32_t nrounds)
 {
-	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	/* (0,0,0,1) */
+	const uint32x4_t ctr32_inc = vsetq_lane_u32(1, vdupq_n_u32(0), 3);
 	uint8x16_t auth, ptxt, ctr_be;
 	uint32x4_t ctr;
 
@@ -313,12 +308,12 @@ aes_neon_ccm_enc1(const struct aesenc *e
 
 	auth = loadblock(authctr);
 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 		uint8x16x2_t b2;
 		ptxt = loadblock(in);
 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 
 		b2.val[0] = auth ^ ptxt;
 		b2.val[1] = ctr_be;
@@ -335,7 +330,8 @@ aes_neon_ccm_dec1(const struct aesenc *e
     uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
     uint32_t nrounds)
 {
-	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	/* (0,0,0,1) */
+	const uint32x4_t ctr32_inc = vsetq_lane_u32(1, vdupq_n_u32(0), 3);
 	uint8x16_t auth, ctr_be, ptxt, pad;
 	uint32x4_t ctr;
 
@@ -343,9 +339,9 @@ aes_neon_ccm_dec1(const struct aesenc *e
 	KASSERT(nbytes % 16 == 0);
 
 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	ctr = vaddq_u32(ctr, ctr32_inc);
-	ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+	ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 	pad = aes_neon_enc1(enc, ctr_be, nrounds);
 	auth = loadblock(authctr);
 	for (;; in += 16, out += 16) {
@@ -359,7 +355,7 @@ aes_neon_ccm_dec1(const struct aesenc *e
 			break;
 
 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 		b2.val[0] = auth;
 		b2.val[1] = ctr_be;
 		b2 = aes_neon_enc2(enc, b2, nrounds);

Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2	Tue Jul 28 20:11:09 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_impl.h,v 1.3 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -32,6 +32,7 @@
 #include <sys/types.h>
 
 #include "arm_neon.h"
+#include "arm_neon_imm.h"
 
 #include <crypto/aes/aes.h>
 #include <crypto/aes/arch/arm/aes_neon.h>

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.7 src/sys/crypto/aes/arch/arm/arm_neon.h:1.8
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.7	Tue Jul 28 20:11:09 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.8 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,6 +39,7 @@
 typedef __Int32x4_t int32x4_t;
 typedef __Int64x2_t int64x2_t;
 typedef __Int8x16_t int8x16_t;
+typedef __Uint16x8_t uint16x8_t;
 typedef __Uint32x4_t uint32x4_t;
 typedef __Uint64x2_t uint64x2_t;
 typedef __Uint8x16_t uint8x16_t;
@@ -47,6 +48,7 @@ typedef struct { uint8x16_t val[2]; } ui
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
 typedef __simd128_int8_t int8x16_t;
+typedef __simd128_uint16_t uint16x8_t;
 typedef __simd128_uint32_t uint32x4_t;
 typedef __simd128_uint64_t uint64x2_t;
 typedef __simd128_uint8_t uint8x16_t;
@@ -58,10 +60,15 @@ typedef struct { uint8x8_t val[2]; } uin
 typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 #endif
 
-#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
-#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#if defined(__AARCH64EB__)
+#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
+#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
+#elif defined(__ARM_BIG_ENDIAN)
+#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
+#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
 #else
-#define	__neon_lane_index(__v, __i)	__i
+#define	__neon_lane_index(__v, __i)	(__i)
+#define	__neon_laneq_index(__v, __i)	(__i)
 #endif
 
 #elif defined(__clang__)
@@ -72,17 +79,23 @@ typedef struct { uint8x16_t val[2]; } ui
 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
 typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
 typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+
 typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
 typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 
 #ifdef __LITTLE_ENDIAN__
 #define	__neon_lane_index(__v, __i)	__i
+#define	__neon_laneq_index(__v, __i)	__i
 #else
 #define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
 #endif
 
 #else
@@ -166,7 +179,8 @@ _INTRINSATTR
 static __inline uint8x16_t
 vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
 {
-#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+#ifdef __aarch64__
+#if defined(__AARCH64EB__)
 	return __builtin_shuffle(__hi, __lo,
 	    (uint8x16_t) {
 		16 - __i, 17 - __i, 18 - __i, 19 - __i,
@@ -183,6 +197,10 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h
 		__i + 12, __i + 13, __i + 14, __i + 15,
 	});
 #endif
+#else
+	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
+	    (int8x16_t)__hi, __i);
+#endif
 }
 #elif defined(__clang__)
 #ifdef __LITTLE_ENDIAN__
@@ -220,7 +238,7 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _
 #elif defined(__clang__)
 #define	vgetq_lane_u32(__v, __i)					      \
 	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	      \
-	    __neon_lane_index(__v, __i))
+	    __neon_laneq_index(__v, __i))
 #endif
 
 _INTRINSATTR
@@ -332,6 +350,27 @@ vreinterpretq_s32_u8(uint8x16_t __v)
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vreinterpretq_u16_u32(uint32x4_t __v)
+{
+	return (uint16x8_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u16(uint16x8_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u64(uint64x2_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
 static __inline uint32x4_t
 vreinterpretq_u32_u8(uint8x16_t __v)
 {
@@ -340,6 +379,13 @@ vreinterpretq_u32_u8(uint8x16_t __v)
 
 _INTRINSATTR
 static __inline uint64x2_t
+vreinterpretq_u64_u32(uint32x4_t __v)
+{
+	return (uint64x2_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
 vreinterpretq_u64_u8(uint8x16_t __v)
 {
 	return (uint64x2_t)__v;
@@ -367,6 +413,17 @@ vreinterpretq_u8_u64(uint64x2_t __v)
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vrev32q_u16(uint16x8_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
+#elif defined(__clang__)
+	return __builtin_shufflevector(__v, __v,  1,0, 3,2, 5,4, 7,6);
+#endif
+}
+
+_INTRINSATTR
 static __inline uint8x16_t
 vrev32q_u8(uint8x16_t __v)
 {
@@ -374,7 +431,7 @@ vrev32q_u8(uint8x16_t __v)
 	return __builtin_shuffle(__v,
 	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
 #elif defined(__clang__)
-	return __builtin_shufflevector(__v,
+	return __builtin_shufflevector(__v, __v,
 	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
 #endif
 }
@@ -384,13 +441,13 @@ _INTRINSATTR
 static __inline uint32x4_t
 vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
 {
-	__v[__neon_lane_index(__v, __i)] = __x;
+	__v[__neon_laneq_index(__v, __i)] = __x;
 	return __v;
 }
 #elif defined(__clang__)
 #define	vsetq_lane_u32(__x, __v, __i)					      \
 	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v),    \
-	    __neon_lane_index(__v, __i))
+	    __neon_laneq_index(__v, __i))
 #endif
 
 #if defined(__GNUC__) && !defined(__clang__)
@@ -398,13 +455,13 @@ _INTRINSATTR
 static __inline uint64x2_t
 vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
 {
-	__v[__neon_lane_index(__v, __i)] = __x;
+	__v[__neon_laneq_index(__v, __i)] = __x;
 	return __v;
 }
 #elif defined(__clang__)
 #define	vsetq_lane_u64(__x, __v, __i)					      \
-	(uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v),    \
-	    __neon_lane_index(__v, __i));
+	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v),    \
+	    __neon_laneq_index(__v, __i));
 #endif
 
 #if defined(__GNUC__) && !defined(__clang__)
@@ -435,7 +492,7 @@ vshrq_n_u32(uint32x4_t __v, uint8_t __bi
 #endif
 }
 #elif defined(__clang__)
-#define	vshrq_n_u8(__v, __bits)						      \
+#define	vshrq_n_u32(__v, __bits)					      \
 	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
 #endif
 
@@ -488,6 +545,40 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t 
 #endif	/* __LITTLE_ENDIAN__ */
 #endif
 
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
+	    (int32x4_t)__vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vsriq_n_u32(__vins, __vsh, __bits)				      \
+	(int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
+	    (int32x4_t)(__vsh), (__bits), 34)
+#else
+#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
+{									      \
+	int32x4_t __tvins = (__vins);					      \
+	int32x4_t __tvsh = (__vsh);					      \
+	uint8_t __tbits = (__bits);					      \
+	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
+	    3,2,1,0);							      \
+	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
+	    3,2,1,0);							      \
+	int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits,    \
+	    34);							      \
+	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
+})
+#endif
+#endif
+
 _INTRINSATTR
 static __inline void
 vst1q_u32(uint32_t *__p32, uint32x4_t __v)
@@ -533,4 +624,58 @@ vst1q_u8(uint8_t *__p8, uint8x16_t __v)
 #endif
 }
 
+#ifndef __aarch64__		/* XXX */
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
+	    (int8x8_t)__idx);
+#elif defined(__clang__)
+	uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
+	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
+#endif
+	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
+	    (int8x8_t)__idx, 16);
+#ifndef __LITTLE_ENDIAN__
+	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
+#endif
+	return __ret;
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	union {
+		uint8x8x2_t __u8x8x82;
+		__builtin_neon_ti __ti;
+	} __u = { __tab };
+	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
+#elif defined(__clang__)
+	uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
+	    7,6,5,4,3,2,1,0);
+	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
+	    7,6,5,4,3,2,1,0);
+	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
+#endif
+	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
+	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
+#ifndef __LITTLE_ENDIAN__
+	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
+#endif
+	return __ret;
+#endif
+}
+
+#endif	/* !defined(__aarch64__) */
+
 #endif	/* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */

Index: src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.4
--- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3	Mon Jul 27 20:58:56 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.4 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -43,6 +43,7 @@ typedef __Uint16x8_t uint16x8_t;
 typedef __Uint32x4_t uint32x4_t;
 typedef __Uint64x2_t uint64x2_t;
 typedef __Uint8x16_t uint8x16_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 #else
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
@@ -56,12 +57,18 @@ typedef __simd64_int8_t int8x8_t;
 typedef __simd64_uint8_t uint8x8_t;
 typedef __builtin_neon_udi uint64x1_t;
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 #endif
 
-#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
-#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#if defined(__AARCH64EB__)
+#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
+#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
+#elif defined(__ARM_BIG_ENDIAN)
+#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
+#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
 #else
-#define	__neon_lane_index(__v, __i)	__i
+#define	__neon_lane_index(__v, __i)	(__i)
+#define	__neon_laneq_index(__v, __i)	(__i)
 #endif
 
 #elif defined(__clang__)
@@ -79,12 +86,16 @@ typedef __attribute__((neon_vector_type(
 typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
 
 #ifdef __LITTLE_ENDIAN__
 #define	__neon_lane_index(__v, __i)	__i
+#define	__neon_laneq_index(__v, __i)	__i
 #else
 #define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
+#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
 #endif
 
 #else
@@ -168,7 +179,8 @@ _INTRINSATTR
 static __inline uint8x16_t
 vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
 {
-#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
+#ifdef __aarch64__
+#if defined(__AARCH64EB__)
 	return __builtin_shuffle(__hi, __lo,
 	    (uint8x16_t) {
 		16 - __i, 17 - __i, 18 - __i, 19 - __i,
@@ -185,6 +197,10 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h
 		__i + 12, __i + 13, __i + 14, __i + 15,
 	});
 #endif
+#else
+	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
+	    (int8x16_t)__hi, __i);
+#endif
 }
 #elif defined(__clang__)
 #ifdef __LITTLE_ENDIAN__
@@ -222,7 +238,7 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _
 #elif defined(__clang__)
 #define	vgetq_lane_u32(__v, __i)					      \
 	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	      \
-	    __neon_lane_index(__v, __i))
+	    __neon_laneq_index(__v, __i))
 #endif
 
 _INTRINSATTR
@@ -403,7 +419,7 @@ vrev32q_u16(uint16x8_t __v)
 #if defined(__GNUC__) && !defined(__clang__)
 	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
 #elif defined(__clang__)
-	return __builtin_shufflevector(__v,  1,0, 3,2, 5,4, 7,6);
+	return __builtin_shufflevector(__v, __v,  1,0, 3,2, 5,4, 7,6);
 #endif
 }
 
@@ -415,7 +431,7 @@ vrev32q_u8(uint8x16_t __v)
 	return __builtin_shuffle(__v,
 	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
 #elif defined(__clang__)
-	return __builtin_shufflevector(__v,
+	return __builtin_shufflevector(__v, __v,
 	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
 #endif
 }
@@ -425,13 +441,13 @@ _INTRINSATTR
 static __inline uint32x4_t
 vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
 {
-	__v[__neon_lane_index(__v, __i)] = __x;
+	__v[__neon_laneq_index(__v, __i)] = __x;
 	return __v;
 }
 #elif defined(__clang__)
 #define	vsetq_lane_u32(__x, __v, __i)					      \
 	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v),    \
-	    __neon_lane_index(__v, __i))
+	    __neon_laneq_index(__v, __i))
 #endif
 
 #if defined(__GNUC__) && !defined(__clang__)
@@ -439,13 +455,13 @@ _INTRINSATTR
 static __inline uint64x2_t
 vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
 {
-	__v[__neon_lane_index(__v, __i)] = __x;
+	__v[__neon_laneq_index(__v, __i)] = __x;
 	return __v;
 }
 #elif defined(__clang__)
 #define	vsetq_lane_u64(__x, __v, __i)					      \
-	(uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v),    \
-	    __neon_lane_index(__v, __i));
+	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v),    \
+	    __neon_laneq_index(__v, __i));
 #endif
 
 #if defined(__GNUC__) && !defined(__clang__)
@@ -476,7 +492,7 @@ vshrq_n_u32(uint32x4_t __v, uint8_t __bi
 #endif
 }
 #elif defined(__clang__)
-#define	vshrq_n_u8(__v, __bits)						      \
+#define	vshrq_n_u32(__v, __bits)					      \
 	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
 #endif
 

Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.8
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.7	Tue Jul 28 20:08:48 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon.c,v 1.8 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -30,42 +30,18 @@
 #include <sys/endian.h>
 
 #include "arm_neon.h"
+#include "arm_neon_imm.h"
 #include "chacha_neon.h"
 
-static inline uint32x4_t
-vrolq_n_u32(uint32x4_t x, uint8_t n)
-{
-
-	/*
-	 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
-	 * practice it hurts performance at least on Cortex-A8.
-	 */
+/*
+ * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in practice
+ * it hurts performance at least on Cortex-A8.
+ */
 #if 1
-	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+#define	vrolq_n_u32(x, n)	(vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - (n)))
 #else
-	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
+#define	vrolq_n_u32(x, n)	vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - (n))
 #endif
-}
-
-static inline uint32x4_t
-vhtole_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}
-
-static inline uint32x4_t
-vletoh_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}
 
 static inline uint32x4_t
 rol16(uint32x4_t x)
@@ -88,10 +64,10 @@ static inline uint32x4_t
 rol8(uint32x4_t x)
 {
 #if defined(__aarch64__)
-	static const uint8x16_t rol8_tab = {
+	static const uint8x16_t rol8_tab = VQ_N_U8(
 		  3, 0, 1, 2,  7, 4, 5, 6,
-		 11, 8, 9,10, 15,12,13,14,
-	};
+		 11, 8, 9,10, 15,12,13,14
+	);
 	uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
 
 	y8 = vqtbl1q_u8(x8, rol8_tab);
@@ -106,9 +82,9 @@ rol8(uint32x4_t x)
 	 * loops, but it doesn't and so attempting to use VTBL hurts
 	 * more than it helps.
 	 */
-	static const uint8x8_t rol8_tab = {
-		 3, 0, 1, 2,  7, 4, 5, 6,
-	};
+	static const uint8x8_t rol8_tab = V_N_U8(
+		 3, 0, 1, 2,  7, 4, 5, 6
+	);
 
 	uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
 
@@ -180,17 +156,17 @@ chacha_core_neon(uint8_t out[restrict st
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;
 
-	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));
 
 	chacha_permute(&r0, &r1, &r2, &r3, nr);
 
-	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
-	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
-	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
-	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
+	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
+	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
 }
 
 void
@@ -202,15 +178,15 @@ hchacha_neon(uint8_t out[restrict static
 {
 	uint32x4_t r0, r1, r2, r3;
 
-	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = vreinterpretq_u32_u8(vld1q_u8(in));
 
 	chacha_permute(&r0, &r1, &r2, &r3, nr);
 
-	vst1q_u32((uint32_t *)out + 0, r0);
-	vst1q_u32((uint32_t *)out + 4, r3);
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
 }
 
 void
@@ -225,19 +201,20 @@ chacha_stream_neon(uint8_t *restrict s, 
 		chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
 
 	if (n) {
-		const uint32x4_t blkno_inc = {1,0,0,0};
+		const uint32x4_t blkno_inc = /* (1,0,0,0) */
+		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
 		uint32x4_t in0, in1, in2, in3;
 		uint32x4_t r0, r1, r2, r3;
 
-		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-		in3 = (uint32x4_t) {
+		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+		in3 = (uint32x4_t) VQ_N_U32(
 			blkno,
 			le32dec(nonce),
 			le32dec(nonce + 4),
 			le32dec(nonce + 8)
-		};
+		);
 
 		for (; n; s += 64, n -= 64) {
 			r0 = in0;
@@ -245,27 +222,27 @@ chacha_stream_neon(uint8_t *restrict s, 
 			r2 = in2;
 			r3 = in3;
 			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			r0 = vaddq_u32(r0, in0);
+			r1 = vaddq_u32(r1, in1);
+			r2 = vaddq_u32(r2, in2);
+			r3 = vaddq_u32(r3, in3);
 
 			if (n < 64) {
 				uint8_t buf[64] __aligned(16);
 
-				vst1q_u32((uint32_t *)buf + 4*0, r0);
-				vst1q_u32((uint32_t *)buf + 4*1, r1);
-				vst1q_u32((uint32_t *)buf + 4*2, r2);
-				vst1q_u32((uint32_t *)buf + 4*3, r3);
+				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
 				memcpy(s, buf, n);
 
 				break;
 			}
 
-			vst1q_u32((uint32_t *)s + 4*0, r0);
-			vst1q_u32((uint32_t *)s + 4*1, r1);
-			vst1q_u32((uint32_t *)s + 4*2, r2);
-			vst1q_u32((uint32_t *)s + 4*3, r3);
+			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
 	}
@@ -284,19 +261,20 @@ chacha_stream_xor_neon(uint8_t *s, const
 		    chacha_const32, nr);
 
 	if (n) {
-		const uint32x4_t blkno_inc = {1,0,0,0};
+		const uint32x4_t blkno_inc = /* (1,0,0,0) */
+		    vsetq_lane_u32(1, vdupq_n_u32(0), 0);
 		uint32x4_t in0, in1, in2, in3;
 		uint32x4_t r0, r1, r2, r3;
 
-		in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-		in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-		in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-		in3 = (uint32x4_t) {
+		in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+		in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+		in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+		in3 = (uint32x4_t) VQ_N_U32(
 			blkno,
 			le32dec(nonce),
 			le32dec(nonce + 4),
 			le32dec(nonce + 8)
-		};
+		);
 
 		for (; n; s += 64, p += 64, n -= 64) {
 			r0 = in0;
@@ -304,19 +282,19 @@ chacha_stream_xor_neon(uint8_t *s, const
 			r2 = in2;
 			r3 = in3;
 			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
+			r0 = vaddq_u32(r0, in0);
+			r1 = vaddq_u32(r1, in1);
+			r2 = vaddq_u32(r2, in2);
+			r3 = vaddq_u32(r3, in3);
 
 			if (n < 64) {
 				uint8_t buf[64] __aligned(16);
 				unsigned i;
 
-				vst1q_u32((uint32_t *)buf + 4*0, r0);
-				vst1q_u32((uint32_t *)buf + 4*1, r1);
-				vst1q_u32((uint32_t *)buf + 4*2, r2);
-				vst1q_u32((uint32_t *)buf + 4*3, r3);
+				vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+				vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+				vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+				vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));
 
 				for (i = 0; i < n - n%4; i += 4)
 					le32enc(s + i,
@@ -327,14 +305,14 @@ chacha_stream_xor_neon(uint8_t *s, const
 				break;
 			}
 
-			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
-			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
-			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
-			r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
-			vst1q_u32((uint32_t *)s + 4*0, r0);
-			vst1q_u32((uint32_t *)s + 4*1, r1);
-			vst1q_u32((uint32_t *)s + 4*2, r2);
-			vst1q_u32((uint32_t *)s + 4*3, r3);
+			r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
+			r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
+			r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
+			r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
+			vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
 	}

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3
--- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2	Wed Jul 29 14:23:59 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $")
 
 	.fpu	neon
 
@@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
  */
 
 .macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
-	vld1.32		{\c2-\c3}, [fp, :256]
+	vld1.8		{\c2-\c3}, [fp, :256]
 .endm
 
 .macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
@@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3
 
-	vst1.32		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
+	vst1.8		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
 
 	veor		\c0, \b0, \c0
 	veor		\c1, \b1, \c1
@@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vtbl.8		\d3l, {\d3l}, \c0l
 	vtbl.8		\d3h, {\d3h}, \c0l
 
-	vld1.32		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
+	vld1.8		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
 
 	/* c += d; b ^= c; b <<<= 7 */
 	vadd.u32	\c2, \c2, \d2
@@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1
 
-	vst1.32		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
+	vst1.8		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
 
 	veor		\c2, \b2, \c2
 	veor		\c3, \b3, \c3
@@ -143,14 +143,6 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vsri.u32	\b1, \c3, #(32 - 7)
 .endm
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	HTOLE32(x)
-#define	LE32TOH(x)
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	HTOLE32(x)	vrev32.8	x, x
-#define	LE32TOH(x)	vrev32.8	x, x
-#endif
-
 	.text
 	.p2align 2
 .Lconstants_addr:
@@ -183,10 +175,16 @@ ENTRY(chacha_stream256_neon)
 	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
 	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
 
-	vld1.32	{q12}, [r4]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
+	vld1.8	{q12}, [r4]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r3]	/* q13-q14 := key */
 	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
 
+#ifdef __ARM_BIG_ENDIAN
+	rev	r6, r6
+	rev	r8, r8
+	rev	r10, r10
+#endif
+
 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
 	vdup.32	q2, d25[0]
@@ -205,23 +203,6 @@ ENTRY(chacha_stream256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10
 
-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f
 
 	_ALIGN_TEXT
@@ -283,9 +264,9 @@ ENTRY(chacha_stream256_neon)
 	vzip.32	q6, q7
 
 	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r4]	/* q9 := constant */
+	vld1.8	{q9}, [r4]	/* q9 := constant */
 	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r3]!	/* q8 := key[0:16) */
 
 	vswp	d1, d4
 	vswp	d9, d12
@@ -330,19 +311,10 @@ ENTRY(chacha_stream256_neon)
 	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 
-	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */
-
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q3)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q6)
-	LE32TOH(q7)
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */
 
-	vst1.32	{q0-q1}, [r0]!
-	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!
+	vld1.8	{q0}, [r3]	/* q0 := key[16:32) */
 	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -370,23 +342,14 @@ ENTRY(chacha_stream256_neon)
 	vadd.u32 q11, q11, q1
 	vadd.u32 q15, q15, q1
 
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
-
-	/* vst1.32	{q0-q1}, [r0]! */
-	vst1.32	{q8-q9}, [r0]!
-	vst1.32	{q2-q3}, [r0]!
-	vst1.32	{q10-q11}, [r0]!
-	vst1.32	{q4-q5}, [r0]!
-	vst1.32	{q12-q13}, [r0]!
-	vst1.32 {q6-q7}, [r0]!
-	vst1.32 {q14-q15}, [r0]
+	/* vst1.8	{q0-q1}, [r0]! */
+	vst1.8	{q8-q9}, [r0]!
+	vst1.8	{q2-q3}, [r0]!
+	vst1.8	{q10-q11}, [r0]!
+	vst1.8	{q4-q5}, [r0]!
+	vst1.8	{q12-q13}, [r0]!
+	vst1.8	{q6-q7}, [r0]!
+	vst1.8	{q14-q15}, [r0]
 
 	/* zero temporary space on the stack */
 	vmov.i32 q0, #0
@@ -426,10 +389,16 @@ ENTRY(chacha_stream_xor256_neon)
 	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
 	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
 
-	vld1.32	{q12}, [r5]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
+	vld1.8	{q12}, [r5]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r4]	/* q13-q14 := key */
 	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
 
+#ifdef __ARM_BIG_ENDIAN
+	rev	r6, r6
+	rev	r8, r8
+	rev	r10, r10
+#endif
+
 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
 	vdup.32	q2, d25[0]
@@ -448,23 +417,6 @@ ENTRY(chacha_stream_xor256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10
 
-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f
 
 	_ALIGN_TEXT
@@ -496,9 +448,9 @@ ENTRY(chacha_stream_xor256_neon)
 	vzip.32	q6, q7
 
 	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r5]	/* q9 := constant */
+	vld1.8	{q9}, [r5]	/* q9 := constant */
 	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r4]!	/* q8 := key[0:16) */
 
 	vswp	d3, d6
 	vswp	d9, d12
@@ -518,24 +470,15 @@ ENTRY(chacha_stream_xor256_neon)
 	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 
-	vld1.32 {q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
-
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q6)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q3)
-	LE32TOH(q7)
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
 
 	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
 	veor	q1, q1, q9
 
-	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */
 
-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
-	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
+	vld1.8	{q0}, [r4]	/* q0 := key[16:32) */
 	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -563,57 +506,48 @@ ENTRY(chacha_stream_xor256_neon)
 	vadd.u32 q11, q11, q1
 	vadd.u32 q15, q15, q1
 
-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
-
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
 
 	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
 	veor	q1, q1, q9
 
-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */
 
 	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
 	veor	q3, q3, q9
 
-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
-	vst1.32	{q2-q3}, [r0]!	/* store ciphertext bytes [64:80) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
+	vst1.8	{q2-q3}, [r0]!	/* store ciphertext bytes [64:96) */
 
 	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
 	veor	q11, q11, q1
 
-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
-	vst1.32	{q10-q11}, [r0]!	/* store ciphertext bytes [80:96) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
+	vst1.8	{q10-q11}, [r0]!	/* store ciphertext bytes [96:128) */
 
 	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
 	veor	q5, q5, q9
 
-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
-	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [96:112) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
+	vst1.8	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
 
 	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
 	veor	q13, q13, q1
 
-	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
-	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [112:128) */
+	vld1.8	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
+	vst1.8	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */
 
 	veor	q6, q6, q8	/* compute ciphertext bytes [192:224) */
 	veor	q7, q7, q9
 
-	vst1.32	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */
+	vst1.8	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */
 
 	veor	q14, q14, q0	/* compute ciphertext bytes [224:256) */
 	veor	q15, q15, q1
 
-	vst1.32	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */
+	vst1.8	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */
 
 	/* zero temporary space on the stack */
 	vmov.i32 q0, #0
@@ -637,5 +571,5 @@ END(v0123)
 
 	.type	rot8,%object
 rot8:
-	.long	0x02010003, 0x06050407
+	.byte	3,0,1,2, 7,4,5,6
 END(rot8)
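
For reference, the rot8 change above swaps a pair of .long words for
explicit byte indices, so the vtbl.8 table is the same bytes in memory
on little- and big-endian targets.  The following scalar sketch is
illustrative only (not part of the commit): it emulates vtbl.8 in C and
assumes a little-endian host so that memcpy exposes the same byte lanes.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* index table from chacha_neon_32.S above */
	static const uint8_t rot8[8] = { 3,0,1,2, 7,4,5,6 };
	uint32_t x[2] = { 0x01234567, 0x89abcdef }, y[2];
	uint8_t in[8], out[8];
	unsigned i;

	memcpy(in, x, sizeof in);	/* view the 32-bit lanes as bytes */
	for (i = 0; i < 8; i++)		/* vtbl.8 out, {in}, rot8 */
		out[i] = in[rot8[i]];
	memcpy(y, out, sizeof y);

	/* prints "23456701 abcdef89": each word rotated left by 8 */
	printf("%08x %08x\n", y[0], y[1]);
	return 0;
}

Each 32-bit lane comes out rotated left by 8 bits, which is the
`x <<<= 8' step of the ChaCha quarter-round that the vtbl.8
instructions in ROUND compute with this table.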

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6
--- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5	Tue Jul 28 15:42:41 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Sat Aug  8 14:47:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <aarch64/asm.h>
 
-RCSID("$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $")
 
 #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
 STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
@@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
 
-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
 
 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -234,22 +234,22 @@ ENTRY(chacha_stream256_neon)
 	add	v14.4s, v14.4s, v30.4s
 	add	v15.4s, v15.4s, v31.4s
 
-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
 
 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
@@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
 
-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
 
 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -401,22 +401,22 @@ ENTRY(chacha_stream_xor256_neon)
 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
 
-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
 
 	eor	v16.16b, v16.16b, v0.16b
 	eor	v17.16b, v17.16b, v1.16b

Added files:

Index: src/sys/crypto/aes/arch/arm/arm_neon_imm.h
diff -u /dev/null src/sys/crypto/aes/arch/arm/arm_neon_imm.h:1.1
--- /dev/null	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon_imm.h	Sat Aug  8 14:47:01 2020
@@ -0,0 +1,80 @@
+/*	$NetBSD: arm_neon_imm.h,v 1.1 2020/08/08 14:47:01 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H
+#define	_SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H
+
+/*
+ * Non-standard macros for writing ARM NEON vector literals.  Needed
+ * because apparently GCC and Clang disagree wildly on the ordering of
+ * vector literal components -- and both disagree with the
+ * architectural indexing!
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__)
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{d,c, b,a}
+#elif defined(__ARM_BIG_ENDIAN)
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{b,a, d,c}
+#else
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{a,b,c,d,e,f,g,h}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{a,b, c,d}
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{a,b,c,d,e,f,g,h}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{a,b, c,d}
+#else
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{d,c, b,a}
+#endif
+#endif
+
+#endif	/* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H */
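
As a usage sketch (illustrative only, not taken from this commit: the
function name is made up and the system <arm_neon.h> stands in for the
wrappers in this directory), the quadword initializer might be used to
spell the ChaCha ``expand 32-byte k'' constant so that the four words
land in consistent lanes under GCC and Clang, little- or big-endian:

#include <arm_neon.h>

#include "arm_neon_imm.h"

static inline uint32x4_t
chacha_sigma(void)
{
	/* VQ_N_U32 hides the compiler- and endian-dependent brace order. */
	uint32x4_t sigma = VQ_N_U32(0x61707865, 0x3320646e, 0x79622d32,
	    0x6b206574);

	return sigma;
}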

Index: src/sys/crypto/chacha/arch/arm/arm_neon_imm.h
diff -u /dev/null src/sys/crypto/chacha/arch/arm/arm_neon_imm.h:1.1
--- /dev/null	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon_imm.h	Sat Aug  8 14:47:01 2020
@@ -0,0 +1,80 @@
+/*	$NetBSD: arm_neon_imm.h,v 1.1 2020/08/08 14:47:01 riastradh Exp $	*/
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H
+#define	_SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H
+
+/*
+ * Non-standard macros for writing ARM NEON vector literals.  Needed
+ * because apparently GCC and Clang disagree wildly on the ordering of
+ * vector literal components -- and both disagree with the
+ * architectural indexing!
+ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__AARCH64EB__)
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{d,c, b,a}
+#elif defined(__ARM_BIG_ENDIAN)
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{b,a, d,c}
+#else
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{a,b,c,d,e,f,g,h}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{a,b, c,d}
+#endif
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{a,b,c,d,e,f,g,h}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{a,b, c,d}
+#else
+#define	V_N_U8(a,b,c,d,e,f,g,h)						      \
+	{h,g,f,e,d,c,b,a}
+#define	VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p)			      \
+	{h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
+#define	VQ_N_U32(a,b,c,d)						      \
+	{d,c, b,a}
+#endif
+#endif
+
+#endif	/* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H */
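
A byte-index table can be written the same way with VQ_N_U8.  The
sketch below is illustrative only (AArch64 intrinsics, hypothetical
function name) and assumes the macro places its arguments in ascending
byte lanes, as the header intends; it performs the same
rotate-left-by-8 that the rot8 table implements in the 32-bit assembly:

#include <arm_neon.h>

#include "arm_neon_imm.h"

static inline uint32x4_t
rol32x4_8(uint32x4_t x)
{
	/* byte i of each 32-bit lane comes from byte (i + 3) mod 4 */
	uint8x16_t rot8 = VQ_N_U8(3,0,1,2, 7,4,5,6, 11,8,9,10,
	    15,12,13,14);

	return vreinterpretq_u32_u8(
	    vqtbl1q_u8(vreinterpretq_u8_u32(x), rot8));
}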
