On Fri, Jul 10, 2015 at 12:43:48PM +0200, Jakub Jelinek wrote:
On Fri, Jul 10, 2015 at 11:37:18AM +0200, Uros Bizjak wrote:
Have you tried the new SSE4.2 implementation (the one with asm flags) with
an unrolled loop?
Also, the SSE4.2 implementation looks shorter, so more I-cache friendly,
so I wouldn't really say it is redundant if they are roughly same speed.
Ok, I tried to also optimize sse4 and found that the main problem was
that checking index==16 caused high latency.
The trick was to check the first 64 bytes in a header using flags. Then the
loop is relatively unlikely to be entered, as lines longer than 64 bytes are
relatively rare.
I tested that on more machines. On Haswell sse4 is noticeably faster, on
Nehalem sse2 is still a bit faster, and on AMD fx10 it's a lot slower. How
do I check the processor to select sse2 on AMD processors, where it's
considerably slower?
nehalem
real	0m12.091s
user	0m12.088s
sys	0m0.000s
real	0m11.350s
user	0m11.347s
sys	0m0.000s
real	0m8.521s
user	0m8.519s
sys	0m0.000s
real	0m10.407s
user	0m10.402s
sys	0m0.004s
real	0m8.325s
user	0m8.323s
sys	0m0.000s
ivy bridge
real	0m5.005s
user	0m5.004s
sys	0m0.001s
real	0m4.149s
user	0m4.150s
sys	0m0.000s
real	0m3.836s
user	0m3.837s
sys	0m0.000s
real	0m4.097s
user	0m4.098s
sys	0m0.000s
real	0m3.928s
user	0m3.930s
sys	0m0.000s
fx10
real	0m10.456s
user	0m10.469s
sys	0m0.000s
real	0m9.356s
user	0m9.371s
sys	0m0.000s
real	0m9.418s
user	0m9.433s
sys	0m0.000s
real	0m9.039s
user	0m9.050s
sys	0m0.003s
real	0m7.983s
user	0m7.992s
sys	0m0.003s
* libcpp/lex.c (search_line_sse2): Use better header.
(search_line_sse42): Likewise.
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 0ad9660..3bf9eae 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -374,35 +374,107 @@ search_line_sse2 (const uchar *s, const uchar *end
ATTRIBUTE_UNUSED)
unsigned int misalign, found, mask;
const v16qi *p;
- v16qi data, t;
+ v16qi data, t, tx;
+
+  if (s + 80 < end)
+{
+ v16qi x0 = __builtin_ia32_loaddqu ((char const *) s);
+ tx = __builtin_ia32_pcmpeqb128 (x0, repl_nl);
+ tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr);
+ tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs);
+ tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm);
+
+ found = __builtin_ia32_pmovmskb128 (tx);
+ if (found)
+{
+ found = __builtin_ctz (found);
+ return (const uchar *) s + found;
+}
+ v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16));
+ v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32));
+ v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48));
+ v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64));
+
+ tx = __builtin_ia32_pcmpeqb128 (x1, repl_nl);
+ tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr);
+ tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs);
+ tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm);
+
+ found = __builtin_ia32_pmovmskb128 (tx);
+
+ if (found)
+{
+ found = __builtin_ctz (found);
+ return (const uchar *) s + 16 + found;
+}
+
+ tx = __builtin_ia32_pcmpeqb128 (x2, repl_nl);
+ tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr);
+ tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs);
+ tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm);
+
+ found = __builtin_ia32_pmovmskb128 (tx);
+
+ if (found)
+{
+ found = __builtin_ctz (found);
+ return (const uchar *) s + 32 + found;
+}
+
+
+ tx = __builtin_ia32_pcmpeqb128 (x3, repl_nl);
+ tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr);
+ tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs);
+ tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm);
+
+ found = __builtin_ia32_pmovmskb128 (tx);
+
+ if (found)
+{
+ found = __builtin_ctz (found);
+ return (const uchar *) s + 48 + found;
+}
+
+ tx = __builtin_ia32_pcmpeqb128 (x4, repl_nl);
+ tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr);
+ tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs);
+ tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm);
+
+ found = __builtin_ia32_pmovmskb128 (tx);
+
+ if (found)
+{
+ found = __builtin_ctz (found);
+ return (const uchar *) s + 64 + found;
+}
+
+ s += 80;
+}
/* Align the source pointer. */
misalign = (uintptr_t)s & 15;
p = (const v16qi *)((uintptr_t)s & -16);
data = *p;
- /* Create a mask for the bytes that are valid within the first
- 16-byte block. The Idea here is that the AND with the mask
- within the loop is free, since we need some AND or TEST
- insn in order to set the flags for the branch anyway. */
mask = -1u << misalign;
- /* Main loop processing 16 bytes at a time. */
- goto start;
- do
+ t = __builtin_ia32_pcmpeqb128(data,