Module Name:    src
Committed By:   matt
Date:           Tue Dec 18 06:05:56 UTC 2012

Modified Files:
        src/sys/arch/arm/cortex: cpu_in_cksum_asm_neon.S

Log Message:
Tighten up cpu_in_cksum_neon_v4hdr by 3 instructions.
Swap the two doublewords (d4/d5) after a partial qword load on big-endian (BE) platforms.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
diff -u src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.1 src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.2
--- src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.1	Mon Dec 17 00:44:03 2012
+++ src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S	Tue Dec 18 06:05:56 2012
@@ -28,9 +28,8 @@
  */
 
 #include <machine/asm.h>
-#include "assym.h"
 
-RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.1 2012/12/17 00:44:03 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.2 2012/12/18 06:05:56 matt Exp $")
 
 /*
  * uint32_t
@@ -102,6 +101,9 @@ END(cpu_in_cksum_neon)
 partial_qword:
 	str		lr, [sp, #-8]!	/* save LR */
 	vld1.64		{d4-d5}, [ip:128]!	/* fetch data */
+#ifdef __ARMEB__
+	vswp		d5, d4		/* on BE, MSW should be in d5 */
+#endif
 	veor		q0, q0, q0	/* create a null mask */
 	movs		r0, r1, lsl #3	/* any leading bytes? */
 	blne		_C_LABEL(__neon_leading_qword_bitmask)
@@ -123,16 +125,13 @@ partial_qword:
  * uint32_t cpu_in_cksum_neon_v4hdr(void *dptr)
  */
 ENTRY(cpu_in_cksum_neon_v4hdr)
-	veor		q1, q1, q1
 	bic		ip, r0, #7
 	vld1.32		{d0-d2},[ip]	/* it must be in 24 bytes */
-	mov		r1, #0		/* now we must clear one register */
 	tst		r0, #4		/* depending on 64-bit alignment */
 	beq		1f
 	vmov		s0, s5		/* move last U32 to first U32 */
 1:	vmovl.u32	q1, d2		/* move s5 to d3 and clear s5 */
-	vmovl.u16	q2, d0		/* 4 U16 -> 4 U32 */
-	vadd.u32	q3, q3, q2	/* add 4 U32 to accumulator */
+	vmovl.u16	q3, d0		/* 4 U16 -> 4 U32 */
 	vmovl.u16	q2, d1		/* 4 U16 -> 4 U32 */
 	vadd.u32	q3, q3, q2	/* add 4 U32 to accumulator */
 	vmovl.u16	q2, d2		/* 4 U16 -> 4 U32 */

Reply via email to