Module Name:    src
Committed By:   matt
Date:           Mon Dec 31 07:58:44 UTC 2012

Modified Files:
        src/common/lib/libc/arch/arm/string: strlen_armv6.S

Log Message:
Make this work on all ARMs but keep the armv6 optimizations.  It is as fast as
the existing strlen for small strings, and once strings are 8 bytes or more in
length it starts getting significantly faster.  For really long strings,
compared to the existing strlen, this uses about 1/2 of the cycles for the
non-armv6 version and about 1/3 of the cycles for the armv6 version.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/common/lib/libc/arch/arm/string/strlen_armv6.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/arch/arm/string/strlen_armv6.S
diff -u src/common/lib/libc/arch/arm/string/strlen_armv6.S:1.2 src/common/lib/libc/arch/arm/string/strlen_armv6.S:1.3
--- src/common/lib/libc/arch/arm/string/strlen_armv6.S:1.2	Sat Dec 29 05:36:57 2012
+++ src/common/lib/libc/arch/arm/string/strlen_armv6.S	Mon Dec 31 07:58:44 2012
@@ -29,37 +29,44 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: strlen_armv6.S,v 1.2 2012/12/29 05:36:57 matt Exp $")
+RCSID("$NetBSD: strlen_armv6.S,v 1.3 2012/12/31 07:58:44 matt Exp $")
 
-	.text
-ENTRY(strlen)
-	add	ip, r0, #4		/* for the final post-inc */
-	ands	r1, r0, #3		/* get misalignment */
-	bic	r0, r0, #3		/* align to word boundary */
-	ldr	r3, [r0], #4		/* load first word */
-	beq	.Lpre_main_loop		/*   misaligned?  no, go to loop */
-	/*
-	 * For misaligned string, we need to make sure that the bytes before
-	 * the start of the string will not cause a false match to a NUL.
-	 */
-	mvn	r2, #0			/* create a mask */
-	mov	r1, r1, lsl #3		/* bytes -> bits */
 #ifdef __ARMEL__
-	mov	r2, r2, lsl r1		/* clear relavent bytes */
+#define	BYTE0	0x000000ff
+#define	BYTE1	0x0000ff00
+#define	BYTE2	0x00ff0000
+#define	BYTE3	0xff000000
 #else
-	mov	r2, r2, lsr r1		/* clear relavent bytes */
+#define	BYTE0	0xff000000
+#define	BYTE1	0x00ff0000
+#define	BYTE2	0x0000ff00
+#define	BYTE3	0x000000ff
 #endif
-	mvn	r2, r2			/* invert mask */
-	orr	r3, r3, r2		/* orr in mask for leading bytes */
+
+	.text
+ENTRY(strlen)
+	add	ip, r0, #4		/* for the final post-inc */
+1:	tst	r0, #3			/* test for word alignment */
+	beq	.Lpre_main_loop		/*   finally word aligned */
+	ldrb	r3, [r0], #1		/* load a byte */
+	teq	r3, #0			/* is it 0? */
+	bne	1b			/*   no, try next byte */
+	sub	ip, ip, #3		/* subtract (4 - the NUL) */
+	sub	r0, r0, ip		/* subtract start */
+	RET				/* return */
 .Lpre_main_loop:
-#ifdef _ARM_ARCH_7
+#if defined(_ARM_ARCH_6)
+#if defined(_ARM_ARCH_7)
 	movw	r1, #0xfefe		/* magic constant; 254 in each byte */
 #else
 	mov	r1, #0xfe		/* put 254 in low byte */
 	orr	r1, r1, r1, lsl #8	/* move to next byte */
 #endif
 	orr	r1, r1, r1, lsl #16	/* move to next halfword */
+#endif /* _ARM_ARCH_6 */
 .Lmain_loop:
+	ldr	r3, [r0], #4		/* load next word */
+#if defined(_ARM_ARCH_6)
 	/*
 	 * Add 254 to each byte using the UQADD8 (unsigned saturating add 8)
 	 * instruction.  For every non-NUL byte, the result for that byte will
@@ -67,24 +74,47 @@ ENTRY(strlen)
 	 * result, if the result is non-0 then we must have encountered a NUL.
 	 */
 	uqadd8	r3, r3, r1		/* magic happens here */
-	mvns	r3, r3			/* is the complemented result 0? */
-	bne	.Lreturn		/*    no, then we encountered a NUL */
-	ldr	r3, [r0], #4		/* load next word */
-	b	.Lmain_loop		/* and go */
+	mvns	r3, r3			/* is the complemented result non-0? */
+	beq	.Lmain_loop		/*    no, then we encountered no NULs */
+#else
+	/*
+	 * No fancy shortcuts so just test each byte lane for a NUL.
+	 * (other tests for NULs in a word take more instructions/cycles).
+	 */
+	tst	r3, #BYTE0		/* is this byte 0? */
+	tstne	r3, #BYTE1		/*   no, is this byte 0? */
+	tstne	r3, #BYTE2		/*   no, is this byte 0? */
+	tstne	r3, #BYTE3		/*   no, is this byte 0? */
+	bne	.Lmain_loop		/*   no, then get next word */
+#endif
+#if defined(_ARM_ARCH_6)
 	/*
 	 * We encountered a NUL.  Find out where by doing a CLZ and then
 	 * shifting right by 3.  That will be the number of non-NUL bytes.
 	 */
-.Lreturn:
 #ifdef __ARMEL__
 	rev	r3, r3			/* we want this in BE for the CLZ */
 #endif
 	clz	r3, r3			/* count how many leading zeros */
 	add	r0, r0, r3, lsr #3	/* divide that by 8 and add to count */
+#else
+	/*
+	 * We encountered a NUL.
+	 */
+	tst	r3, #BYTE0		/* 1st byte was NUL? */
+	beq	1f			/*   yes, done adding */
+	add	r0, r0, #1		/* we have one more non-NUL byte */
+	tst	r3, #BYTE1		/* 2nd byte was NUL? */
+	beq	1f			/*   yes, done adding */
+	add	r0, r0, #1		/* we have one more non-NUL byte */
+	tst	r3, #BYTE2		/* 3rd byte was NUL? */
+	addne	r0, r0, #1		/* no, we have one more non-NUL byte */
+1:
+#endif /* _ARM_ARCH_6 */
 	/*
-	 * r0 now points to 4 past the NUL due to the post-inc.  Subtract
-	 * the start of the string (which also has 4 added to it to compensate
-	 * for the post-inc.
+	 * r0 now points to 4 past the NUL due to the post-inc.  Subtract the
+	 * start of the string (which also has 4 added to it to compensate for
+	 * the post-inc).
 	 */
 	sub	r0, r0, ip		/* subtract start to get length */
 	RET

Reply via email to