Module Name:	src
Committed By:	riastradh
Date:		Mon Jul 27 20:48:18 UTC 2020
Modified Files:
	src/sys/crypto/chacha/arch/arm: chacha_neon.c
	src/sys/crypto/chacha/arch/x86: chacha_sse2.c

Log Message:
Reduce some duplication.  Shouldn't substantively hurt performance --
the comparison that has been moved into the loop was essentially the
former loop condition -- and may improve performance by reducing code
size since there's only one inline call to chacha_permute instead of
two.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/x86/chacha_sse2.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
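The control-flow change is easiest to see in miniature.  Below is a
minimal sketch (hypothetical code, not from the tree: dummy_block is a
stand-in for chacha_permute plus the feed-forward addition) of the same
transformation the diffs apply: the old loop condition `n >= 64'
reappears inverted inside the loop as `n < 64', which stores the final
partial block through an intermediate buffer and breaks, so the block
generator appears once instead of twice:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Hypothetical stand-in for chacha_permute + feed-forward. */
	static void
	dummy_block(uint8_t out[64], uint32_t blkno)
	{
		unsigned i;

		for (i = 0; i < 64; i++)
			out[i] = (uint8_t)(blkno + i);
	}

	static void
	stream(uint8_t *s, size_t n, uint32_t blkno)
	{
		uint8_t buf[64];

		for (; n; s += 64, n -= 64, blkno++) {
			dummy_block(buf, blkno);	/* single copy */
			if (n < 64) {
				/* old loop condition, inverted */
				memcpy(s, buf, n);
				break;
			}
			memcpy(s, buf, 64);
		}
	}

Because the break fires before the `n -= 64' loop step, the unsigned
count never wraps when n is not a multiple of 64.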
Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.2
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Mon Jul 27 20:48:18 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -168,7 +168,7 @@ chacha_stream_neon(uint8_t *restrict s,
 			le32dec(nonce + 8)
 		};
 
-		for (; n >= 64; s += 64, n -= 64) {
+		for (; n; s += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -178,32 +178,25 @@ chacha_stream_neon(uint8_t *restrict s,
 			r1 = vhtole_u32(vaddq_u32(r1, in1));
 			r2 = vhtole_u32(vaddq_u32(r2, in2));
 			r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+
+				vst1q_u32((uint32_t *)buf + 4*0, r0);
+				vst1q_u32((uint32_t *)buf + 4*1, r1);
+				vst1q_u32((uint32_t *)buf + 4*2, r2);
+				vst1q_u32((uint32_t *)buf + 4*3, r3);
+				memcpy(s, buf, n);
+
+				break;
+			}
+
 			vst1q_u32((uint32_t *)s + 4*0, r0);
 			vst1q_u32((uint32_t *)s + 4*1, r1);
 			vst1q_u32((uint32_t *)s + 4*2, r2);
 			vst1q_u32((uint32_t *)s + 4*3, r3);
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-			memcpy(s, buf, n);
-		}
 	}
 }
 
@@ -234,7 +227,7 @@ chacha_stream_xor_neon(uint8_t *s, const
 			le32dec(nonce + 8)
 		};
 
-		for (; n >= 64; s += 64, p += 64, n -= 64) {
+		for (; n; s += 64, p += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -244,6 +237,25 @@ chacha_stream_xor_neon(uint8_t *s, const
 			r1 = vhtole_u32(vaddq_u32(r1, in1));
 			r2 = vhtole_u32(vaddq_u32(r2, in2));
 			r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+				unsigned i;
+
+				vst1q_u32((uint32_t *)buf + 4*0, r0);
+				vst1q_u32((uint32_t *)buf + 4*1, r1);
+				vst1q_u32((uint32_t *)buf + 4*2, r2);
+				vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+				for (i = 0; i < n - n%4; i += 4)
+					le32enc(s + i,
+					    le32dec(p + i) ^ le32dec(buf + i));
+				for (; i < n; i++)
+					s[i] = p[i] ^ buf[i];
+
+				break;
+			}
+
 			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
 			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
 			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
@@ -254,31 +266,6 @@ chacha_stream_xor_neon(uint8_t *s, const
 			vst1q_u32((uint32_t *)s + 4*3, r3);
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i,
-				    le32dec(p + i) ^ le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = p[i] ^ buf[i];
-		}
 	}
 }

Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.c
diff -u src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1 src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.2
--- src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/chacha_sse2.c	Mon Jul 27 20:48:18 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+/*	$NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -313,7 +313,7 @@ out:	if (n) {
 		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
 		    le32dec(nonce), blkno);
 
-		for (; n >= 64; s += 64, n -= 64) {
+		for (; n; s += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -323,36 +323,25 @@ out:	if (n) {
 			r1 = _mm_add_epi32(r1, in1);
 			r2 = _mm_add_epi32(r2, in2);
 			r3 = _mm_add_epi32(r3, in3);
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+
+				_mm_storeu_si128((__m128i *)buf + 0, r0);
+				_mm_storeu_si128((__m128i *)buf + 1, r1);
+				_mm_storeu_si128((__m128i *)buf + 2, r2);
+				_mm_storeu_si128((__m128i *)buf + 3, r3);
+				memcpy(s, buf, n);
+
+				break;
+			}
+
 			_mm_storeu_si128((__m128i *)s + 0, r0);
 			_mm_storeu_si128((__m128i *)s + 1, r1);
 			_mm_storeu_si128((__m128i *)s + 2, r2);
 			_mm_storeu_si128((__m128i *)s + 3, r3);
 			in3 = _mm_add_epi32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = _mm_add_epi32(r0, in0);
-			r1 = _mm_add_epi32(r1, in1);
-			r2 = _mm_add_epi32(r2, in2);
-			r3 = _mm_add_epi32(r3, in3);
-			_mm_storeu_si128((__m128i *)buf + 0, r0);
-			_mm_storeu_si128((__m128i *)buf + 1, r1);
-			_mm_storeu_si128((__m128i *)buf + 2, r2);
-			_mm_storeu_si128((__m128i *)buf + 3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i, le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = buf[i];
-		}
 	}
 }
 
@@ -480,7 +469,7 @@ out:	if (n) {
 		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
 		    le32dec(nonce), blkno);
 
-		for (; n >= 64; s += 64, p += 64, n -= 64) {
+		for (; n; s += 64, p += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -490,6 +479,25 @@ out:	if (n) {
 			r1 = _mm_add_epi32(r1, in1);
 			r2 = _mm_add_epi32(r2, in2);
 			r3 = _mm_add_epi32(r3, in3);
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+				unsigned i;
+
+				_mm_storeu_si128((__m128i *)buf + 0, r0);
+				_mm_storeu_si128((__m128i *)buf + 1, r1);
+				_mm_storeu_si128((__m128i *)buf + 2, r2);
+				_mm_storeu_si128((__m128i *)buf + 3, r3);
+
+				for (i = 0; i < n - n%4; i += 4)
+					le32enc(s + i,
+					    le32dec(p + i) ^ le32dec(buf + i));
+				for (; i < n; i++)
+					s[i] = p[i] ^ buf[i];
+
+				break;
+			}
+
 			r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
 			r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
 			r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
@@ -500,31 +508,6 @@ out:	if (n) {
 			_mm_storeu_si128((__m128i *)s + 3, r3);
 			in3 = _mm_add_epi32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = _mm_add_epi32(r0, in0);
-			r1 = _mm_add_epi32(r1, in1);
-			r2 = _mm_add_epi32(r2, in2);
-			r3 = _mm_add_epi32(r3, in3);
-			_mm_storeu_si128((__m128i *)buf + 0, r0);
-			_mm_storeu_si128((__m128i *)buf + 1, r1);
-			_mm_storeu_si128((__m128i *)buf + 2, r2);
-			_mm_storeu_si128((__m128i *)buf + 3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i,
-				    le32dec(p + i) ^ le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = p[i] ^ buf[i];
-		}
 	}
 }
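For the stream_xor variants, the relocated tail XORs the plaintext
against the keystream buffer a 32-bit word at a time, then byte by byte
for the last n%4 bytes.  A self-contained sketch of that tail, where
my_le32dec and my_le32enc are hypothetical portable stand-ins for
NetBSD's le32dec/le32enc from <sys/endian.h>:

	#include <stddef.h>
	#include <stdint.h>

	static uint32_t
	my_le32dec(const uint8_t *p)
	{

		return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
		    ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
	}

	static void
	my_le32enc(uint8_t *p, uint32_t v)
	{

		p[0] = v & 0xff;
		p[1] = (v >> 8) & 0xff;
		p[2] = (v >> 16) & 0xff;
		p[3] = (v >> 24) & 0xff;
	}

	/* XOR n (< 64) bytes of plaintext p with keystream buf into s. */
	static void
	xor_tail(uint8_t *s, const uint8_t *p, const uint8_t buf[64],
	    size_t n)
	{
		size_t i;

		for (i = 0; i < n - n%4; i += 4)
			my_le32enc(s + i,
			    my_le32dec(p + i) ^ my_le32dec(buf + i));
		for (; i < n; i++)
			s[i] = p[i] ^ buf[i];
	}

The word-at-a-time loop is only an optimization; the byte loop alone
would produce the same output.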