Module Name:    src
Committed By:   matt
Date:           Thu Dec 20 07:18:33 UTC 2012

Modified Files:
        src/sys/arch/arm/arm: cpu_in_cksum_buffer.S

Log Message:
Make this work.  Various fixes and some further optimizations.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/arch/arm/arm/cpu_in_cksum_buffer.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/arm/arm/cpu_in_cksum_buffer.S
diff -u src/sys/arch/arm/arm/cpu_in_cksum_buffer.S:1.1 src/sys/arch/arm/arm/cpu_in_cksum_buffer.S:1.2
--- src/sys/arch/arm/arm/cpu_in_cksum_buffer.S:1.1	Wed Dec 19 15:05:16 2012
+++ src/sys/arch/arm/arm/cpu_in_cksum_buffer.S	Thu Dec 20 07:18:33 2012
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.1 2012/12/19 15:05:16 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.2 2012/12/20 07:18:33 matt Exp $")
 
 /*
  * Special note:
@@ -66,7 +66,7 @@ ENTRY(cpu_in_cksum_buffer)
 #ifndef __OPTIMIZE_SIZE__
 	rsb	r3, r3, #64		/* subtract from 64 */
 #ifdef _ARM_ARCH_DWORD_OK
-	add	r3, r3, r1, lsr #1	/* multiply by 1.5 */
+	add	r3, r3, r3, lsr #1	/* multiply by 1.5 */
 	add	pc, pc, r3		/* and jump! */
 #else
 	add	pc, pc, r3, lsl #1	/* multiply by 2 and jump! */
@@ -97,6 +97,7 @@ ENTRY(cpu_in_cksum_buffer)
 	LOAD_DWORD_INTO_R4(r0)		/* 1 dword left */
 .Ladd_one_dword:
 	adcs	ip, ip, r4
+.Ladd_one_word:
 	adcs	ip, ip, r5
 	teq	r2, r0			/* nothing left? */
 	beq	.Lfold			/*   yep, proceed to hold */
@@ -107,13 +108,14 @@ ENTRY(cpu_in_cksum_buffer)
 	bne	4b			/*   yep, do 64 at time */
 #endif
 	bics	r3, r1, #7		/* at least 8 bytes left? */
-	bge	3b			/*   yep, do them */
+	bne	3b			/*   yep, do them */
 
 .Lfinal_dword:
-	tst	r1, #4			/* more than one word more left? */
+	sub	r3, r1, #1		/* 0-3 = 1 word, 4-7 = 2 words */
+	tst	r3, #4			/* more than one word more left? */
 	moveq	r4, #0			/*   no, just use zero */
-	ldrne	r4, [r0], #4		/*   yes, load first word */
-	ldr	r5, [r0]		/* load last word */
+	ldreq	r5, [r0]		/*   no, load last word */
+	ldmneia	r0, {r4-r5}		/*   yes, load last dword */
 .Lfinal_dword_noload:
 	rsb	r1, r1, #4		/* find out many bytes to discard */
 #ifdef __ARMEL__
@@ -127,7 +129,9 @@ ENTRY(cpu_in_cksum_buffer)
 	tst	r1, #1			/* discard odd? */
 	bicne	r5, r5, #0x000000ff	/*   yes, discard odd byte */
 #endif
-	adds	ip, ip, r4		/* add 1st to accumulator */
+.Lfinal_add_one_dword:
+	adcs	ip, ip, r4		/* add 1st to accumulator */
+.Lfinal_add_one_word:
 	adcs	ip, ip, r5		/* add 2nd to accumulator */
 
 	/*
@@ -143,33 +147,58 @@ ENTRY(cpu_in_cksum_buffer)
 #include "cpu_in_cksum_fold.S"
 
 .Ldword_misaligned:
+	tst	r0, #3			/* are we at least word aligned? */
+	bne	.Lword_misaligned	/*   no, do it the hard way */
+	ldr	r5, [r0], #4		/* load word here in case of partial */
+	sub	r1, r1, #4		/* subtract length of one word */
+	teq	r1, #0 			/* what is length? */
+	beq	.Lfinal_add_one_word	/*   = 0? just do the final add */
+	addgt	r2, r1, r0		/*   > 0? point r2 just past end */
+	bgt	.Ladd_one_word		/*   > 0? accumulate it and loop */
+	mov	r4, #0			/*   < 0? zero this */
+	b	.Lfinal_dword_noload	/*   < 0? handle final partial dword */
+
+.Lword_misaligned:
+	tst	r0, #4			/* do we load 1 or 2 words? */
 	bic	r0, r0, #3		/* force word alignment */
-	add	r1, r1, r2		/* add misalignment to length */
-	tst	r2, #4			/* first  */
-	ldr	r4, [r0], #4		/* load first word */
+	add	r1, r1, r2		/* add initial offset to length */
+	sub	r1, r1, #8		/* subtract length of one dword */
+	ldmeqia	r0!, {r4-r5}		/* load first dword */
+	ldrne	r4, [r0], #4		/* load first word */
 	movne	r5, #0			/* no second word */
-	ldreq	r5, [r0], #4		/* load second word */
 	/*
 	 * We are now dword aligned.
 	 */
 #ifdef __ARMEL__
 	tst	r2, #2			/* discard at least 2? */
 	movne	r4, r4, lsr #16		/* yes, discard lower halfword */
-	tst	r2, #1			/* discard odd? */
-	bicne	r4, r4, #0x0000ff00	/* yes, discard odd byte */
+	tst	r2, #1			/* start odd? */
+	bicne	r4, r4, #0x000000ff	/* yes, discard even byte */
 #else
 	tst	r2, #2			/* discard at least 2? */
 	movne	r4, r4, lsl #16		/* yes, discard upper halfword */
-	tst	r2, #1			/* discard odd? */
-	bicne	r4, r4, #0x00ff0000	/* yes, discard odd byte */
+	tst	r2, #1			/* start odd? */
+	bicne	r4, r4, #0xff000000	/* yes, discard even byte */
 #endif
 	/*
+	 * Since we started on an odd boundary, set up our stack frame so we
+	 * fixup the return value to be byteswapped.
+	 */
+	ldrne	r3, [sp, #4]		/* pop r5 */
+	strne	r3, [sp, #-4]!		/* push it again */ 
+	ldrne	r3, [sp, #4]		/* pop r4 */
+	strne	r3, [sp, #-4]!		/* push it again */
+	strne	lr, [sp, #8]		/* save our return address */
+	adrne	lr, .Lmisaligned_fixup	/* use new LR to fix up the return value */
+	/*
 	 * See if we have a least a full dword to process.  If we do, jump
 	 * into the main loop as if we just load a single dword.
 	 */
-	bics	r3, r1, #7		/* at least one dword? */
-	addne	r2, r1, r0		/*   yes, point r2 just past end */
-	bne	.Ladd_one_dword		/*   yes, accumulate it and loop */
+	teq	r1, #0 			/* what is length? */
+	beq	.Lfinal_add_one_word	/*   = 0? just do the final add */
+	addgt	r2, r1, r0		/*   > 0? point r2 just past end */
+	bgt	.Ladd_one_dword		/*   > 0? accumulate it and loop */
+
 	/*
 	 * Not a full dword so do the final dword processing to find out
 	 * bytes to discard.  If we only loaded one word, move it to 2nd
@@ -177,7 +206,23 @@ ENTRY(cpu_in_cksum_buffer)
 	 * clear the 1st word.
 	 */
 	tst	r2, #4			/* one or two words? */
-	movne	r5, r4			/*   one, move 1st word to 2nd word */
-	movne	r4, #0			/*        and clear 1st word */
+	moveq	r5, r4			/*   one, move 1st word to 2nd word */
+	moveq	r4, #0			/*        and clear 1st word */
 	b	.Lfinal_dword_noload	/* handle final dword */
+
+	/*
+	 * If we had an odd address, we have to byte swap the return value.
+	 * Instead of testing everywhere, we inserted a fake call frame and
+	 * set LR to return here to do the fixup and then return to the caller.
+	 */
+.Lmisaligned_fixup:
+	ldr	lr, [sp], #8		/* fetch saved LR */
+#ifdef _ARM_ARCH_6
+	rev16	r0, r0			/* byte swap */
+#else
+	mov	r0, r0, r0, ror #8	/* move 0:7 to 24:31 and 8:15 to 0:7 */
+	orr	r0, r0, r0, lsl #16	/* move 0:7 to 16:23 */
+	mov	r0, r0, r0, lsr #16	/* clear 16:31 to 0:15 */
+#endif
+	RET
 END(cpu_in_cksum_buffer)

Reply via email to