Module Name:    src
Committed By:   riastradh
Date:           Mon Jul 27 20:48:18 UTC 2020

Modified Files:
        src/sys/crypto/chacha/arch/arm: chacha_neon.c
        src/sys/crypto/chacha/arch/x86: chacha_sse2.c

Log Message:
Reduce some duplication.

Shouldn't substantively hurt performance -- the comparison that has
been moved into the loop is essentially the former loop condition --
and may improve performance by reducing code size, since each
affected function now has only one inline call to chacha_permute
instead of two.
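
For illustration, a minimal standalone sketch of the same
restructuring applied to a toy stream generator (hypothetical names
and a made-up byte-filling step; the real change is in the diffs
below): the duplicated post-loop tail handler disappears, the loop
condition becomes "any bytes left", and a partial final block is
detected inside the loop and written through a bounce buffer.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
fill_stream(uint8_t *s, size_t n, uint8_t seed)
{
	uint8_t buf[64];
	unsigned i;

	for (; n; s += 64, n -= 64) {
		/* one copy of the per-block computation */
		for (i = 0; i < 64; i++)
			buf[i] = (uint8_t)(seed + i);
		seed++;		/* advance the "block counter" */

		if (n < 64) {	/* was the loop condition, inverted */
			memcpy(s, buf, n);	/* partial final block */
			break;
		}
		memcpy(s, buf, 64);	/* full 64-byte block */
	}
}

In the actual NEON and SSE2 functions the full-block path still stores
the SIMD registers straight to the output and only the short tail goes
through the (now 16-byte-aligned) bounce buffer, but the control flow
is the same: one block computation per loop body and an early break on
the tail.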


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/x86/chacha_sse2.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.2
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.1	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Mon Jul 27 20:48:18 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon.c,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -168,7 +168,7 @@ chacha_stream_neon(uint8_t *restrict s, 
 			le32dec(nonce + 8)
 		};
 
-		for (; n >= 64; s += 64, n -= 64) {
+		for (; n; s += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -178,32 +178,25 @@ chacha_stream_neon(uint8_t *restrict s, 
 			r1 = vhtole_u32(vaddq_u32(r1, in1));
 			r2 = vhtole_u32(vaddq_u32(r2, in2));
 			r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+
+				vst1q_u32((uint32_t *)buf + 4*0, r0);
+				vst1q_u32((uint32_t *)buf + 4*1, r1);
+				vst1q_u32((uint32_t *)buf + 4*2, r2);
+				vst1q_u32((uint32_t *)buf + 4*3, r3);
+				memcpy(s, buf, n);
+
+				break;
+			}
+
 			vst1q_u32((uint32_t *)s + 4*0, r0);
 			vst1q_u32((uint32_t *)s + 4*1, r1);
 			vst1q_u32((uint32_t *)s + 4*2, r2);
 			vst1q_u32((uint32_t *)s + 4*3, r3);
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-			memcpy(s, buf, n);
-		}
 	}
 }
 
@@ -234,7 +227,7 @@ chacha_stream_xor_neon(uint8_t *s, const
 			le32dec(nonce + 8)
 		};
 
-		for (; n >= 64; s += 64, p += 64, n -= 64) {
+		for (; n; s += 64, p += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -244,6 +237,25 @@ chacha_stream_xor_neon(uint8_t *s, const
 			r1 = vhtole_u32(vaddq_u32(r1, in1));
 			r2 = vhtole_u32(vaddq_u32(r2, in2));
 			r3 = vhtole_u32(vaddq_u32(r3, in3));
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+				unsigned i;
+
+				vst1q_u32((uint32_t *)buf + 4*0, r0);
+				vst1q_u32((uint32_t *)buf + 4*1, r1);
+				vst1q_u32((uint32_t *)buf + 4*2, r2);
+				vst1q_u32((uint32_t *)buf + 4*3, r3);
+
+				for (i = 0; i < n - n%4; i += 4)
+					le32enc(s + i,
+					    le32dec(p + i) ^ le32dec(buf + i));
+				for (; i < n; i++)
+					s[i] = p[i] ^ buf[i];
+
+				break;
+			}
+
 			r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
 			r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
 			r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
@@ -254,31 +266,6 @@ chacha_stream_xor_neon(uint8_t *s, const
 			vst1q_u32((uint32_t *)s + 4*3, r3);
 			in3 = vaddq_u32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = vhtole_u32(vaddq_u32(r0, in0));
-			r1 = vhtole_u32(vaddq_u32(r1, in1));
-			r2 = vhtole_u32(vaddq_u32(r2, in2));
-			r3 = vhtole_u32(vaddq_u32(r3, in3));
-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i,
-				    le32dec(p + i) ^ le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = p[i] ^ buf[i];
-		}
 	}
 }
 

Index: src/sys/crypto/chacha/arch/x86/chacha_sse2.c
diff -u src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1 src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.2
--- src/sys/crypto/chacha/arch/x86/chacha_sse2.c:1.1	Sat Jul 25 22:49:20 2020
+++ src/sys/crypto/chacha/arch/x86/chacha_sse2.c	Mon Jul 27 20:48:18 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_sse2.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $	*/
+/*	$NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -313,7 +313,7 @@ out:	if (n) {
 		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
 		    le32dec(nonce), blkno);
 
-		for (; n >= 64; s += 64, n -= 64) {
+		for (; n; s += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -323,36 +323,25 @@ out:	if (n) {
 			r1 = _mm_add_epi32(r1, in1);
 			r2 = _mm_add_epi32(r2, in2);
 			r3 = _mm_add_epi32(r3, in3);
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+
+				_mm_storeu_si128((__m128i *)buf + 0, r0);
+				_mm_storeu_si128((__m128i *)buf + 1, r1);
+				_mm_storeu_si128((__m128i *)buf + 2, r2);
+				_mm_storeu_si128((__m128i *)buf + 3, r3);
+				memcpy(s, buf, n);
+
+				break;
+			}
+
 			_mm_storeu_si128((__m128i *)s + 0, r0);
 			_mm_storeu_si128((__m128i *)s + 1, r1);
 			_mm_storeu_si128((__m128i *)s + 2, r2);
 			_mm_storeu_si128((__m128i *)s + 3, r3);
 			in3 = _mm_add_epi32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = _mm_add_epi32(r0, in0);
-			r1 = _mm_add_epi32(r1, in1);
-			r2 = _mm_add_epi32(r2, in2);
-			r3 = _mm_add_epi32(r3, in3);
-			_mm_storeu_si128((__m128i *)buf + 0, r0);
-			_mm_storeu_si128((__m128i *)buf + 1, r1);
-			_mm_storeu_si128((__m128i *)buf + 2, r2);
-			_mm_storeu_si128((__m128i *)buf + 3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i, le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = buf[i];
-		}
 	}
 }
 
@@ -480,7 +469,7 @@ out:	if (n) {
 		in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
 		    le32dec(nonce), blkno);
 
-		for (; n >= 64; s += 64, p += 64, n -= 64) {
+		for (; n; s += 64, p += 64, n -= 64) {
 			r0 = in0;
 			r1 = in1;
 			r2 = in2;
@@ -490,6 +479,25 @@ out:	if (n) {
 			r1 = _mm_add_epi32(r1, in1);
 			r2 = _mm_add_epi32(r2, in2);
 			r3 = _mm_add_epi32(r3, in3);
+
+			if (n < 64) {
+				uint8_t buf[64] __aligned(16);
+				unsigned i;
+
+				_mm_storeu_si128((__m128i *)buf + 0, r0);
+				_mm_storeu_si128((__m128i *)buf + 1, r1);
+				_mm_storeu_si128((__m128i *)buf + 2, r2);
+				_mm_storeu_si128((__m128i *)buf + 3, r3);
+
+				for (i = 0; i < n - n%4; i += 4)
+					le32enc(s + i,
+					    le32dec(p + i) ^ le32dec(buf + i));
+				for (; i < n; i++)
+					s[i] = p[i] ^ buf[i];
+
+				break;
+			}
+
 			r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
 			r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
 			r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
@@ -500,31 +508,6 @@ out:	if (n) {
 			_mm_storeu_si128((__m128i *)s + 3, r3);
 			in3 = _mm_add_epi32(in3, blkno_inc);
 		}
-
-		if (n) {
-			uint8_t buf[64];
-			unsigned i;
-
-			r0 = in0;
-			r1 = in1;
-			r2 = in2;
-			r3 = in3;
-			chacha_permute(&r0, &r1, &r2, &r3, nr);
-			r0 = _mm_add_epi32(r0, in0);
-			r1 = _mm_add_epi32(r1, in1);
-			r2 = _mm_add_epi32(r2, in2);
-			r3 = _mm_add_epi32(r3, in3);
-			_mm_storeu_si128((__m128i *)buf + 0, r0);
-			_mm_storeu_si128((__m128i *)buf + 1, r1);
-			_mm_storeu_si128((__m128i *)buf + 2, r2);
-			_mm_storeu_si128((__m128i *)buf + 3, r3);
-
-			for (i = 0; i < n - n%4; i += 4)
-				le32enc(s + i,
-				    le32dec(p + i) ^ le32dec(buf + i));
-			for (; i < n; i++)
-				s[i] = p[i] ^ buf[i];
-		}
 	}
 }
 
