Module Name: src
Committed By: dsl
Date: Sat Jul 18 11:41:23 UTC 2009
Modified Files:
src/common/lib/libc/arch/x86_64/string: strchr.S
Log Message:
Replace with a version that:
1) doesn't do byte compares to find which byte matched
2) doesn't do byte compares if any top bits are set
3) doesn't use a loop when the input is misaligned
4) has less mispredicted branches
Passes regression tests and 'build.sh' doesn't explode (and more than usual).
To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/common/lib/libc/arch/x86_64/string/strchr.S
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/common/lib/libc/arch/x86_64/string/strchr.S
diff -u src/common/lib/libc/arch/x86_64/string/strchr.S:1.2 src/common/lib/libc/arch/x86_64/string/strchr.S:1.3
--- src/common/lib/libc/arch/x86_64/string/strchr.S:1.2 Fri Jul 17 19:37:57 2009
+++ src/common/lib/libc/arch/x86_64/string/strchr.S Sat Jul 18 11:41:23 2009
@@ -1,133 +1,153 @@
-/*
- * Written by J.T. Conklin <[email protected]>
- * Public domain.
+/* $NetBSD: strchr.S,v 1.3 2009/07/18 11:41:23 dsl Exp $ */
+
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by David Laight.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
*/
+/* See comments in strlen.S about checking words for byte values */
+
#include <machine/asm.h>
#if defined(LIBC_SCCS)
- RCSID("$NetBSD: strchr.S,v 1.2 2009/07/17 19:37:57 dsl Exp $")
+ RCSID("$NetBSD: strchr.S,v 1.3 2009/07/18 11:41:23 dsl Exp $")
#endif
-ENTRY(strchr)
- movzbq %sil,%rcx
-
- /*
- * Align to word boundary.
- * Consider unrolling loop?
- */
-.Lalign:
- testb $7,%dil
- je .Lword_aligned
- movb (%rdi),%dl
- cmpb %cl,%dl
- je .Ldone
- incq %rdi
- testb %dl,%dl
- jne .Lalign
- jmp .Lzero
-
-.Lword_aligned:
- /* copy char to all bytes in word */
- movb %cl,%ch
- movq %rcx,%rdx
- salq $16,%rcx
- orq %rdx,%rcx
- movq %rcx,%rdx
- salq $32,%rcx
- orq %rdx,%rcx
+/*
+ * On entry %rdi is the buffer and the low byte of %rsi (%sil) the
+ * character to search for.
+ *
+ * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable
+ */
+/* Uncomment below to get regression test to run this version but
+ * have everything else use the trivial one below. */
+/* #define TEST_STRCHR */
+
+#ifdef TEST_STRCHR
+ENTRY(test_strchr)
+#else
+ENTRY(strchr)
+#endif
movabsq $0x0101010101010101,%r8
- movabsq $0x8080808080808080,%r9
- /* Check whether any byte in the word is equal to ch or 0. */
- _ALIGN_TEXT
-.Lloop:
- movq (%rdi),%rdx
+ movzbq %sil,%rdx /* value to search for (c) */
+ imul $0x80,%r8,%r9 /* 0x8080808080808080 */
+ imul %r8,%rdx /* (c) copied to all bytes */
+ test $7,%dil
+ jnz 20f /* jump if misaligned */
+
+ _ALIGN_TEXT /* one byte nop */
+1:
+ movq (%rdi),%rax /* bytes to check (x) */
+2:
addq $8,%rdi
- movq %rdx,%rsi
- subq %r8,%rdx
- xorq %rcx,%rsi
- subq %r8,%rsi
- orq %rsi,%rdx
- testq %r9,%rdx
- je .Lloop
-
- /*
- * In rare cases, the above loop may exit prematurely. We must
- * return to the loop if none of the bytes in the word match
- * ch or are equal to 0.
- */
-
- movb -8(%rdi),%dl
- cmpb %cl,%dl /* 1st byte == ch? */
- jne 1f
- subq $8,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 1st byte == 0? */
- je .Lzero
-
- movb -7(%rdi),%dl
- cmpb %cl,%dl /* 2nd byte == ch? */
- jne 1f
- subq $7,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 2nd byte == 0? */
- je .Lzero
-
- movb -6(%rdi),%dl
- cmpb %cl,%dl /* 3rd byte == ch? */
- jne 1f
- subq $6,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 3rd byte == 0? */
- je .Lzero
-
- movb -5(%rdi),%dl
- cmpb %cl,%dl /* 4th byte == ch? */
- jne 1f
- subq $5,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 4th byte == 0? */
- je .Lzero
-
- movb -4(%rdi),%dl
- cmpb %cl,%dl /* 5th byte == ch? */
- jne 1f
- subq $4,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 5th byte == 0? */
- je .Lzero
-
- movb -3(%rdi),%dl
- cmpb %cl,%dl /* 6th byte == ch? */
- jne 1f
- subq $3,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 6th byte == 0? */
- je .Lzero
-
- movb -2(%rdi),%dl
- cmpb %cl,%dl /* 7th byte == ch? */
- jne 1f
- subq $2,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 7th byte == 0? */
- je .Lzero
-
- movb -1(%rdi),%dl
- cmpb %cl,%dl /* 8th byte == ch? */
- jne 1f
- subq $1,%rdi
- jmp .Ldone
-1: testb %dl,%dl /* 8th byte == 0? */
- jne .Lloop
-
-.Lzero:
- /* If a ch wasn't found, return 0. */
- xorq %rdi,%rdi
+ mov %rax,%r10
+ mov %rax,%r11 /* for 'char' check */
+ not %r10 /* invert of data (~x) */
+
+ xorq %rdx,%r11 /* convert 'char' test to one for NUL */
+ subq %r8,%rax /* x - 0x10 */
+ movq %r10,%rsi /* ~x */
+ subq %r8,%r11 /* (x ^ c) - 0x10 */
+/*
+ * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80
+ * and short-circuit the case where no top bits are set, and
+ * we continue the loop.
+ * However it needs 3 more clocks that are difficult to interleave
+ * in the existing dependency chain ...
+ */
+ andq %r9,%rax /* (x - 0x10) & 0x80 */
+ xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */
+ andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */
+ andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */
+ jne 10f /* jump if string ends */
+ andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */
+ je 1b /* jump if no match */
+
+ /* Found char, since LE can use bit scan */
+ bsf %r11,%r11 /* 7, 15, 23 ... 63 */
+8: shr $3,%r11 /* 0, 1, 2 .. 7 */
+ lea -8(%r11,%rdi),%rax
+ ret
-.Ldone:
- movq %rdi,%rax
+/* End of string, check whether char is before NUL */
+ _ALIGN_TEXT /* adds three byte nop */
+10:
+ bsf %rax,%rax /* count to NUL */
+ andq %rsi,%r11 /* check for char in last 8 bytes */
+ je 11f
+ bsf %r11,%r11 /* NUL and char - see which was first */
+ cmp %r11,%rax
+ jae 8b /* return 'found' if same - searching for NUL */
+11: xor %eax,%eax /* char not found */
ret
+
+/* Source misaligned: read aligned word and make low bytes invalid */
+/* I (dsl) think a _ALIGN_TEXT here will slow things down! */
+20:
+ xor %rcx,%rcx
+ mov %rdx,%rsi /* repeated char pattern (c) */
+ sub %dil,%cl /* Convert low address values 1..7 */
+ and $7,%cl /* to 7..1 */
+ and $~7,%dil /* move address to start of word */
+ shl $3,%cl /* now 56, 48 ... 16, 8 */
+ movq (%rdi),%rax /* aligned word containing first data */
+ neg %rsi /* generate ~c (not doesn't set flags) */
+ dec %rsi
+ je 22f /* searching for 0xff */
+21: shr %cl,%rsi /* ~c in low bytes */
+ or %rsi,%rax /* set some bits making low bytes invalid */
+ jmp 2b
+
+/* We are searching for 0xff, so can't use ~pattern for invalid value */
+22:
+ mov %r8,%r10 /* 0x01 pattern */
+ lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */
+ not %r10 /* now 0xfe */
+ sar %cl,%r10 /* top bytes 0xff */
+ and %r10,%rax /* clear lsb from unwanted low bytes */
+ jmp 21b
+
+#ifdef TEST_STRCHR
+/* Trivial version for bug-fixing above */
+ENTRY(strchr)
+ movq %rsi,%rdx
+ movq %rdi,%rsi
+1:
+ lodsb
+ cmp %al,%dl
+ je 2f
+ test %al,%al
+ jne 1b
+ xor %eax,%eax
+ ret
+2: lea -1(%rsi),%rax
+ ret
+#endif
+
STRONG_ALIAS(index,strchr)