CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: jakllsch Date: Tue Sep 8 17:17:32 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: files.chacha_arm Log Message: use correct condition To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/files.chacha_arm Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/files.chacha_arm diff -u src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.4 --- src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.3 Tue Jul 28 20:08:48 2020 +++ src/sys/crypto/chacha/arch/arm/files.chacha_arm Tue Sep 8 17:17:32 2020 @@ -1,9 +1,9 @@ -# $NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $ +# $NetBSD: files.chacha_arm,v 1.4 2020/09/08 17:17:32 jakllsch Exp $ ifdef aarch64 makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" else -makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" +makeoptions chacha "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" endif file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64)
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: jakllsch Date: Mon Sep 7 18:05:17 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon_64.S Log Message: Use a working macro to detect big endian aarch64. Fixes aarch64eb NEON ChaCha. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.7 --- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Mon Sep 7 18:05:17 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $") #define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \ STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ @@ -130,12 +130,12 @@ STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b #define STEP19(a,b,c,d, t, r) /* nothing */ #endif -#if _BYTE_ORDER == _LITTLE_ENDIAN -#define HTOLE32(x) -#define LE32TOH(x) -#elif _BYTE_ORDER == _BIG_ENDIAN +#if defined(__AARCH64EB__) #define HTOLE32(x) rev32 x, x #define LE32TOH(x) rev32 x, x +#else +#define LE32TOH(x) +#define HTOLE32(x) #endif /*
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Sun Aug 23 16:39:06 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon_32.S Log Message: Adjust sp, not fp, to allocate a 32-byte temporary. Costs another couple MOV instructions, but we can't skimp on this -- there's no red zone below sp for interrupts on arm, so we can't touch anything there. So just use fp to save sp and then adjust sp itself, rather than using fp as a temporary register to point just below sp. Should fix PR port-arm/55598 -- previously the ChaCha self-test failed 33/1 trials triggered by sysctl during running system; with the patch it has failed 0/1 trials. (Presumably it happened more often at boot time, leading to 5/26 failures in the test bed, because we just enabled interrupts and some devices are starting to deliver interrupts.) To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.4 --- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Sun Aug 23 16:39:06 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $") .fpu neon @@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2 */ .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3 - vld1.8 {\c2-\c3}, [fp, :256] + vld1.8 {\c2-\c3}, [sp, :256] .endm .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h @@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2 vadd.u32 \c2, \c2, \d2 vadd.u32 \c3, \c3, \d3 - vst1.8 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ + vst1.8 {\c0-\c1}, [sp, :256] /* free c0 and c1 as temps */ veor \c0, \b0, \c0 veor \c1, \b1, \c1 @@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2 vtbl.8 \d3l, {\d3l}, \c0l vtbl.8 \d3h, {\d3h}, \c0l - vld1.8 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ + vld1.8 {\c0-\c1}, [sp, :256] /* restore c0 and c1 */ /* c += d; b ^= c; b <<<= 7 */ vadd.u32 \c2, \c2, \d2 @@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2 vadd.u32 \c0, \c0, \d0 vadd.u32 \c1, \c1, \d1 - vst1.8 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ + vst1.8 {\c2-\c3}, [sp, :256] /* free c2 and c3 as temps */ veor \c2, \b2, \c2 veor \c3, \b3, \c3 @@ -160,17 +160,18 @@ ENTRY(chacha_stream256_neon) /* save callee-saves registers */ push {r4, r5, r6, r7, r8, r10, fp, lr} vpush {d8-d15} + mov fp, sp /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ ldr r7, .Lconstants_addr adr r6, .Lconstants_addr /* reserve space for two 128-bit/16-byte q registers */ - sub fp, sp, #0x20 - bic fp, fp, #0x1f /* align */ + sub sp, sp, #0x20 + bic sp, sp, #0x1f /* align */ /* get parameters */ - add ip, sp, #96 + add ip, fp, #96 add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ ldm ip, {r4, r5} /* r4 := const, r5 := nr */ ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ @@ -311,7 +312,7 @@ ENTRY(chacha_stream256_neon) vadd.u32 q3, 
q3, q8 vadd.u32 q7, q7, q8 - vld1.8 {q8-q9}, [fp, :256] /* restore q8-q9 */ + vld1.8 {q8-q9}, [sp, :256] /* restore q8-q9 */ vst1.8 {q0-q1}, [r0]! vld1.8 {q0}, [r3] /* q0 := key[16:32) */ @@ -354,9 +355,10 @@ ENTRY(chacha_stream256_neon) /* zero temporary space on the stack */ vmov.i32 q0, #0 vmov.i32 q1, #0 - vst1.8 {q0-q1}, [fp, :256] + vst1.8 {q0-q1}, [sp, :256] /* restore callee-saves registers and stack */ + mov sp, fp vpop {d8-d15} pop {r4, r5, r6, r7, r8, r10, fp, lr} bx lr @@ -374,17 +376,18 @@ ENTRY(chacha_stream_xor256_neon) /* save callee-saves registers */ push {r4, r5, r6, r7, r8, r10, fp, lr} vpush {d8-d15} + mov fp, sp /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ ldr r7, .Lconstants_addr adr r6, .Lconstants_addr /* reserve space for two 128-bit/16-byte q registers */ - sub fp, sp, #0x20 - bic fp, fp, #0x1f /* align */ + sub sp, sp, #0x20 + bic sp, sp, #0x1f /* align */ /* get parameters */ - add ip, sp, #96 + add ip, fp, #96 add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */ ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ @@ -475,7 +478,7 @@ ENTRY(chacha_stream_xor256_
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Wed Jul 29 14:23:59 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon_32.S Log Message: Issue three more swaps to save eight stores. Reduces code size and yields a small (~2%) cgd throughput boost. Remove duplicate comment while here. To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2 --- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.1 Tue Jul 28 20:08:48 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S Wed Jul 29 14:23:59 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $ */ +/* $NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $") .fpu neon @@ -305,21 +305,29 @@ ENTRY(chacha_stream256_neon) * q7 = (x3[4], x3[5]; x3[6], x3[7]) * * The first two rows to write out are q0 = x0[0:4) and q4 = - * x0[4:8). If we first swap q1 and q4, then once we've - * written them out we free up consecutive registers q0-q1 for - * store-multiple. + * x0[4:8). 
Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14 + * enables us to issue all stores in consecutive pairs: + * x0 in q0-q1 + * x1 in q8-q9 + * x2 in q2-q3 + * x3 in q10-q11 + * x4 in q4-q5 + * x5 in q12-q13 + * x6 in q6-q7 + * x7 in q14-q15 */ vswp q1, q4 + vswp q3, q6 vadd.u32 q0, q0, q9 vadd.u32 q4, q4, q9 vadd.u32 q2, q2, q9 - vadd.u32 q3, q3, q9 + vadd.u32 q6, q6, q9 vadd.u32 q1, q1, q8 vadd.u32 q5, q5, q8 - vadd.u32 q6, q6, q8 + vadd.u32 q3, q3, q8 vadd.u32 q7, q7, q8 vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ @@ -349,14 +357,17 @@ ENTRY(chacha_stream256_neon) vswp d19, d22 vswp d27, d30 + vswp q9, q12 + vswp q11, q14 + vadd.u32 q8, q8, q0 - vadd.u32 q9, q9, q0 + vadd.u32 q12, q12, q0 vadd.u32 q10, q10, q0 - vadd.u32 q11, q11, q0 + vadd.u32 q14, q14, q0 - vadd.u32 q12, q12, q1 + vadd.u32 q9, q9, q1 vadd.u32 q13, q13, q1 - vadd.u32 q14, q14, q1 + vadd.u32 q11, q11, q1 vadd.u32 q15, q15, q1 LE32TOH(q8) @@ -368,28 +379,18 @@ ENTRY(chacha_stream256_neon) LE32TOH(q14) LE32TOH(q15) - /* prepare to zero temporary space on stack */ - vmov.i32 q0, #0 - vmov.i32 q1, #0 - - /* vst1.32 {q0}, [r0]! */ - /* vst1.32 {q1}, [r0]! */ /* (was q4 before vswp) */ - vst1.32 {q8}, [r0]! - vst1.32 {q12}, [r0]! - vst1.32 {q2}, [r0]! - vst1.32 {q6}, [r0]! - vst1.32 {q10}, [r0]! - vst1.32 {q14}, [r0]! - vst1.32 {q4}, [r0]! /* (was q1 before vswp) */ - vst1.32 {q5}, [r0]! - vst1.32 {q9}, [r0]! - vst1.32 {q13}, [r0]! - vst1.32 {q3}, [r0]! - vst1.32 {q7}, [r0]! - vst1.32 {q11}, [r0]! - vst1.32 {q15}, [r0] + /* vst1.32 {q0-q1}, [r0]! */ + vst1.32 {q8-q9}, [r0]! + vst1.32 {q2-q3}, [r0]! + vst1.32 {q10-q11}, [r0]! + vst1.32 {q4-q5}, [r0]! + vst1.32 {q12-q13}, [r0]! + vst1.32 {q6-q7}, [r0]! 
+ vst1.32 {q14-q15}, [r0] /* zero temporary space on the stack */ + vmov.i32 q0, #0 + vmov.i32 q1, #0 vst1.8 {q0-q1}, [fp, :256] /* restore callee-saves registers and stack */ @@ -481,42 +482,8 @@ ENTRY(chacha_stream_xor256_neon) * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are * transposed from one another, and the x[i] are in general - * registers and memory. So we have: - * - * q0 = (x0[0], x1[0]; x2[0], x3[0]) - * q1 = (x0[1], x1[1]; x2[1], x3[1]) - * q2 = (x0[2], x1[2]; x2[2], x3[2]) - * q3 = (x0[3], x1[3]; x2[3], x3[3]) - * ... - * q15 = (x0[15], x1[15]; x2[15], x3[15]) - * - * where xi[j] is the jth word of the ith 16-word block. Zip - * consecutive pairs with vzip.32, and you get: - * - * q0 = (x0[0], x0[1]; x1[0], x1[1]) - * q1 = (x2[0], x2[1]; x3[0], x3[1]) - * q2 = (x0[2], x0[3]; x1[2], x1[3]) - * q3 = (x2[2], x2[3]; x3[2], x3[3]) - * ... - * q15 = (x2[14], x2[15]; x3[14], x3[15]) - * - * As 64-bit d registers, this is: - * - * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) - * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) - * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) - * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) - * ... - * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) - * - * Swap d1<->d4, d3<->d6, ..., and you get: - * - * q0 = (x0[0], x0[1]; x0[2], x0[3]) - * q1 = (x2[0], x2[1]; x2[2], x2[3]) - * q2 = (x1[0], x1[1]; x1[2], x1[3]) - * q3 = (x3[0], x3[1]; x3[2], x3[3]) - * ... - * q15 = (x15[0], x15[1]; x15[2], x15[3]) + * registers and memory. S
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jul 28 20:05:33 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon.c Log Message: Fix big-endian build with appropriate casts around vrev32q_u8. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/chacha/arch/arm/chacha_neon.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.6 --- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5 Mon Jul 27 20:58:56 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Tue Jul 28 20:05:33 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */ +/* $NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -53,7 +53,7 @@ vhtole_u32(uint32x4_t x) #if _BYTE_ORDER == _LITTLE_ENDIAN return x; #elif _BYTE_ORDER == _BIG_ENDIAN - return vrev32q_u8(x); + return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); #endif } @@ -63,7 +63,7 @@ vletoh_u32(uint32x4_t x) #if _BYTE_ORDER == _LITTLE_ENDIAN return x; #elif _BYTE_ORDER == _BIG_ENDIAN - return vrev32q_u8(x); + return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); #endif }
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jul 28 15:42:41 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon_64.S Log Message: Fix typo in comment. To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.4 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5 --- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.4 Mon Jul 27 20:57:23 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Tue Jul 28 15:42:41 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_64.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $ */ +/* $NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: chacha_neon_64.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $") +RCSID("$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $") #define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \ STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \ @@ -142,7 +142,7 @@ STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b * chacha_stream256_neon(uint8_t s[256]@x0, * uint32_t blkno@w1, * const uint8_t nonce[12]@x2, - * const uint8_t key[12]@x3, + * const uint8_t key[32]@x3, * const uint8_t const[16]@x4, * unsigned nr@w5) */
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:58:56 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c Log Message: Note that VSRI seems to hurt here. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/arm_neon.h diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3 --- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2 Mon Jul 27 20:58:06 2020 +++ src/sys/crypto/chacha/arch/arm/arm_neon.h Mon Jul 27 20:58:56 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -529,6 +529,40 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t #endif /* __LITTLE_ENDIAN__ */ #endif +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); +#else + return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, + (int32x4_t)__vsh, __bits); +#endif +} +#elif defined(__clang__) +#ifdef __LITTLE_ENDIAN__ +#define vsriq_n_u32(__vins, __vsh, __bits) \ + (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \ + (int32x4_t)(__vsh), (__bits), 34) +#else +#define vsliq_n_s32(__vins, __vsh, __bits) ( \ +{ \ + int32x4_t __tvins = (__vins); \ + int32x4_t __tvsh = (__vsh); \ + uint8_t __tbits = (__bits); \ + int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ + 3,2,1,0); \ + int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ + 3,2,1,0); \ + int32x4_t __r = 
__builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits,\ + 34); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif +#endif + _INTRINSATTR static __inline void vst1q_u32(uint32_t *__p32, uint32x4_t __v) Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.4 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5 --- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.4 Mon Jul 27 20:58:06 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon.c Mon Jul 27 20:58:56 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */ +/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -36,7 +36,15 @@ static inline uint32x4_t vrolq_n_u32(uint32x4_t x, uint8_t n) { + /* + * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in + * practice it hurts performance at least on Cortex-A8. + */ +#if 1 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); +#else + return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n); +#endif } static inline uint32x4_t
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:58:07 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c Log Message: Take advantage of REV32 and TBL for 16-bit and 8-bit rotations. However, disable use of (V)TBL on armv7/aarch32 for now, because for some reason GCC spills things to the stack despite having plenty of free registers, which hurts performance more than it helps at least on ARM Cortex-A8. To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/arm_neon.h cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/chacha_neon.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/arm_neon.h diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2 --- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1 Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/arm_neon.h Mon Jul 27 20:58:06 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -39,6 +39,7 @@ typedef __Int32x4_t int32x4_t; typedef __Int64x2_t int64x2_t; typedef __Int8x16_t int8x16_t; +typedef __Uint16x8_t uint16x8_t; typedef __Uint32x4_t uint32x4_t; typedef __Uint64x2_t uint64x2_t; typedef __Uint8x16_t uint8x16_t; @@ -46,6 +47,7 @@ typedef __Uint8x16_t uint8x16_t; typedef __simd128_int32_t int32x4_t; typedef __simd128_int64_t int64x2_t; typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint16_t uint16x8_t; typedef __simd128_uint32_t uint32x4_t; typedef __simd128_uint64_t uint64x2_t; typedef __simd128_uint8_t uint8x16_t; @@ -70,9 +72,11 @@ typedef struct { uint8x8_t val[2]; } uin typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; + typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; +typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; typedef struct { uint8x8_t val[2]; } uint8x8x2_t; @@ -330,6 +334,27 @@ vreinterpretq_s32_u8(uint8x16_t __v) } _INTRINSATTR +static __inline uint16x8_t +vreinterpretq_u16_u32(uint32x4_t __v) +{ + return (uint16x8_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u16(uint16x8_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u64(uint64x2_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR static __inline uint32x4_t vreinterpretq_u32_u8(uint8x16_t __v) { @@ -338,6 +363,13 @@ vreinterpretq_u32_u8(uint8x16_t __v) _INTRINSATTR static __inline uint64x2_t +vreinterpretq_u64_u32(uint32x4_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t __v) { return (uint64x2_t)__v; @@ -365,6 +397,17 @@ 
vreinterpretq_u8_u64(uint64x2_t __v) } _INTRINSATTR +static __inline uint16x8_t +vrev32q_u16(uint16x8_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) + return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); +#elif defined(__clang__) + return __builtin_shufflevector(__v, 1,0, 3,2, 5,4, 7,6); +#endif +} + +_INTRINSATTR static __inline uint8x16_t vrev32q_u8(uint8x16_t __v) { @@ -531,4 +574,58 @@ vst1q_u8(uint8_t *__p8, uint8x16_t __v) #endif } +#ifndef __aarch64__ /* XXX */ + +_INTRINSATTR +static __inline uint8x8_t +vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, + (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); + __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); +#endif + __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, + (int8x8_t)__idx, 16); +#ifndef __LITTLE_ENDIAN__ + __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); +#endif + return __ret; +#endif +} + +_INTRINSATTR +static __inline uint8x8_t +vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) +{ +#if defined(__GNUC__) && !defined(__clang__) + union { + uint8x8x2_t __u8x8x82; + __builtin_neon_ti __ti; + } __u = { __tab }; + return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); +#elif defined(__clang__) + uint8x8_t __ret; +#ifndef __LITTLE_ENDIAN__ + __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], + 7,6,5,4,3,2,1,0); + __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], + 7,6,5
CVS commit: src/sys/crypto/chacha/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:50:25 UTC 2020 Modified Files: src/sys/crypto/chacha/arch/arm: chacha_neon_64.S Log Message: Use <machine/asm.h> rather than copying things from it here. Vestige from userland build on netbsd-9 during development. To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.2 --- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1 Sat Jul 25 22:51:57 2020 +++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S Mon Jul 27 20:50:25 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $ */ +/* $NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -26,23 +26,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -.macro adrl reg, addr - adrp \reg, \addr - add \reg, \reg, #:lo12:\addr -.endm - -#define _ALIGN_TEXT \ - .p2align 4 - -#define ENTRY(x) \ - .text; \ - _ALIGN_TEXT; \ - .global x; \ - .type x,@function; \ -x: - -#define END(x) \ - .size x, . - x +#include <machine/asm.h> #define ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \ STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r); \