Since the characters we are searching for (CR, LF, '\', '?') all have
distinct ASCII codes mod 16, PSHUFB can help match them all at once.
libcpp/ChangeLog:
* lex.cc (search_line_sse42): Replace with...
(search_line_ssse3): ... this new function. Adjust the use...
(init_vectorized_lexer): ... here.
---
libcpp/lex.cc | 118 ++++++++++++++++++++------------------------------
1 file changed, 46 insertions(+), 72 deletions(-)
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index fa9c03614c..815b8abd29 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -345,84 +345,58 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
}
#ifdef HAVE_AVX2
-/* A version of the fast scanner using SSE 4.2 vectorized string insns. */
+/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */
static const uchar *
-#ifndef __SSE4_2__
-__attribute__((__target__("sse4.2")))
+#ifndef __SSSE3__
+__attribute__((__target__("ssse3")))
#endif
-search_line_sse42 (const uchar *s, const uchar *end)
+search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
typedef char v16qi __attribute__ ((__vector_size__ (16)));
- static const v16qi search = { '\n', '\r', '?', '\\' };
-
- uintptr_t si = (uintptr_t)s;
- uintptr_t index;
-
- /* Check for unaligned input. */
- if (si & 15)
- {
- v16qi sv;
-
- if (__builtin_expect (end - s < 16, 0)
- && __builtin_expect ((si & 0xfff) > 0xff0, 0))
- {
- /* There are less than 16 bytes left in the buffer, and less
- than 16 bytes left on the page. Reading 16 bytes at this
- point might generate a spurious page fault. Defer to the
- SSE2 implementation, which already handles alignment. */
- return search_line_sse2 (s, end);
- }
-
- /* ??? The builtin doesn't understand that the PCMPESTRI read from
- memory need not be aligned. */
- sv = __builtin_ia32_loaddqu ((const char *) s);
- index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
-
- if (__builtin_expect (index < 16, 0))
- goto found;
-
- /* Advance the pointer to an aligned address. We will re-scan a
- few bytes, but we no longer need care for reading past the
- end of a page, since we're guaranteed a match. */
- s = (const uchar *)((si + 15) & -16);
- }
-
- /* Main loop, processing 16 bytes at a time. */
-#ifdef __GCC_ASM_FLAG_OUTPUTS__
- while (1)
+ typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
+ /* Helper vector for pshufb-based matching: each character C we search for
+ is at position (C % 16); slot 0 holds 1 so a NUL byte never matches. */
+ v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
+ static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63);
+
+ int found;
+ /* Process three 16-byte chunks per iteration. */
+ for (; ; s += 48)
{
- char f;
-
- /* By using inline assembly instead of the builtin,
- we can use the result, as well as the flags set. */
- __asm ("%vpcmpestri\t$0, %2, %3"
- : "=c"(index), "=@ccc"(f)
- : "m"(*s), "x"(search), "a"(4), "d"(16));
- if (f)
- break;
-
- s += 16;
+ v16qi data, t;
+ /* Unaligned load. Reading beyond the final newline is safe, since
+ files.cc:read_file_guts pads the allocation. */
+ data = *(const v16qi_u *)s;
+ /* Prevent propagation into pshufb and pcmp as memory operand. */
+ __asm__ ("" : "+x" (data));
+ t = __builtin_ia32_pshufb128 (lut, data);
+ if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+ goto done;
+ /* Second chunk. */
+ data = *(const v16qi_u *)(s + 16);
+ __asm__ ("" : "+x" (data));
+ t = __builtin_ia32_pshufb128 (lut, data);
+ if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+ goto add_16;
+ /* Third chunk. */
+ data = *(const v16qi_u *)(s + 32);
+ __asm__ ("" : "+x" (data));
+ t = __builtin_ia32_pshufb128 (lut, data);
+ if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+ goto add_32;
}
-#else
- s -= 16;
- /* By doing the whole loop in inline assembly,
- we can make proper use of the flags set. */
- __asm ( ".balign 16\n"
- "0: add $16, %1\n"
- " %vpcmpestri\t$0, (%1), %2\n"
- " jnc 0b"
- : "=&c"(index), "+r"(s)
- : "x"(search), "a"(4), "d"(16));
-#endif
-
- found:
- return s + index;
+add_32:
+ s += 16;
+add_16:
+ s += 16;
+done:
+ return s + __builtin_ctz (found);
}
#else
-/* Work around out-dated assemblers without sse4 support. */
-#define search_line_sse42 search_line_sse2
+/* Work around out-dated assemblers without SSSE3 support. */
+#define search_line_ssse3 search_line_sse2
#endif
/* Check the CPU capabilities. */
@@ -440,18 +414,18 @@ init_vectorized_lexer (void)
search_line_fast_type impl = search_line_acc_char;
int minimum = 0;
-#if defined(__SSE4_2__)
+#if defined(__SSSE3__)
minimum = 3;
#elif defined(__SSE2__)
minimum = 2;
#endif
if (minimum == 3)
- impl = search_line_sse42;
+ impl = search_line_ssse3;
else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
{
- if (minimum == 3 || (ecx & bit_SSE4_2))
- impl = search_line_sse42;
+ if (minimum == 3 || (ecx & bit_SSSE3))
+ impl = search_line_ssse3;
else if (minimum == 2 || (edx & bit_SSE2))
impl = search_line_sse2;
}
--
2.44.0