On Tue, Jul 30, 2024 at 8:43 AM Andi Kleen <a...@linux.intel.com> wrote:
>
> From: Andi Kleen <a...@gcc.gnu.org>
>
> AVX2 is widely available on x86 and it allows to do the scanner line
> check with 32 bytes at a time. The code is similar to the SSE2 code
> path, just using AVX and 32 bytes at a time instead of SSE2 16 bytes.
>
> Also adjust the code to allow inlining when the compiler
> is built for an AVX2 host, following what other architectures
> do.
>
> I see about a ~0.6% compile time improvement for compiling i386
> insn-recog.i with -O0.
>
> libcpp/ChangeLog:
>
>         * config.in (HAVE_AVX2): Add.
>         * configure: Regenerate.
>         * configure.ac: Add HAVE_AVX2 check.
>         * lex.cc (repl_chars): Extend to 32 bytes.
>         (search_line_avx2): New function to scan line using AVX2.
>         (init_vectorized_lexer): Check for AVX2 in CPUID.
> ---
>  libcpp/config.in    |  3 ++
>  libcpp/configure    | 17 +++++++++
>  libcpp/configure.ac |  3 ++
>  libcpp/lex.cc       | 91 +++++++++++++++++++++++++++++++++++++++++++--
>  4 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/libcpp/config.in b/libcpp/config.in
> index 253ef03a3dea..8fad6bd4b4f5 100644
> --- a/libcpp/config.in
> +++ b/libcpp/config.in
> @@ -213,6 +213,9 @@
>  /* Define to 1 if you can assemble SSE4 insns. */
>  #undef HAVE_SSE4
>
> +/* Define to 1 if you can assemble AVX2 insns. */
> +#undef HAVE_AVX2
> +
>  /* Define to 1 if you have the <stddef.h> header file. */
>  #undef HAVE_STDDEF_H
>
> diff --git a/libcpp/configure b/libcpp/configure
> index 32d6aaa30699..6d9286ac9601 100755
> --- a/libcpp/configure
> +++ b/libcpp/configure
> @@ -9149,6 +9149,23 @@ if ac_fn_c_try_compile "$LINENO"; then :
>
>  $as_echo "#define HAVE_SSE4 1" >>confdefs.h
>
> +fi
> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
> +    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
> +/* end confdefs.h.  */
> +
> +int
> +main ()
> +{
> +asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))
> +  ;
> +  return 0;
> +}
> +_ACEOF
> +if ac_fn_c_try_compile "$LINENO"; then :
> +
> +$as_echo "#define HAVE_AVX2 1" >>confdefs.h
> +
>  fi
>  rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>  esac
> diff --git a/libcpp/configure.ac b/libcpp/configure.ac
> index b883fec776fe..c06609827924 100644
> --- a/libcpp/configure.ac
> +++ b/libcpp/configure.ac
> @@ -200,6 +200,9 @@ case $target in
>      AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))],
>        [AC_DEFINE([HAVE_SSE4], [1],
>                  [Define to 1 if you can assemble SSE4 insns.])])
> +    AC_TRY_COMPILE([], [asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))],
> +      [AC_DEFINE([HAVE_AVX2], [1],
> +                [Define to 1 if you can assemble AVX2 insns.])])
>  esac
>
>  # Enable --enable-host-shared.
> diff --git a/libcpp/lex.cc b/libcpp/lex.cc
> index 1591dcdf151a..72f3402aac99 100644
> --- a/libcpp/lex.cc
> +++ b/libcpp/lex.cc
> @@ -278,19 +278,31 @@ search_line_acc_char (const uchar *s, const uchar *end 
> ATTRIBUTE_UNUSED)
>  /* Replicated character data to be shared between implementations.
>     Recall that outside of a context with vector support we can't
>     define compatible vector types, therefore these are all defined
> -   in terms of raw characters.  */
> -static const char repl_chars[4][16] __attribute__((aligned(16))) = {
> +   in terms of raw characters.
> +   gcc constant propagates this and usually turns it into a
> +   vector broadcast, so it actually disappears.  */
> +
> +static const char repl_chars[4][32] __attribute__((aligned(32))) = {
>    { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
> +    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
> +    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
>      '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
>    { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
> +    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
> +    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
>      '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
>    { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
> +    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
> +    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
>      '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
>    { '?', '?', '?', '?', '?', '?', '?', '?',
> +    '?', '?', '?', '?', '?', '?', '?', '?',
> +    '?', '?', '?', '?', '?', '?', '?', '?',
>      '?', '?', '?', '?', '?', '?', '?', '?' },
>  };
>
>
> +#ifndef __AVX2__
>  /* A version of the fast scanner using SSE2 vectorized byte compare insns.  
> */
>
>  static const uchar *
> @@ -343,8 +355,9 @@ search_line_sse2 (const uchar *s, const uchar *end 
> ATTRIBUTE_UNUSED)
>    found = __builtin_ctz(found);
>    return (const uchar *)p + found;
>  }
> +#endif
>
> -#ifdef HAVE_SSE4
> +#if defined(HAVE_SSE4) && !defined(__AVX2__)
>  /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
>
>  static const uchar *
> @@ -425,6 +438,71 @@ search_line_sse42 (const uchar *s, const uchar *end)
>  #define search_line_sse42 search_line_sse2
>  #endif
>
> +#ifdef HAVE_AVX2
> +
> +/* A version of the fast scanner using AVX2 vectorized byte compare insns.  
> */
> +
> +static const uchar *
> +#ifndef __AVX2__
> +__attribute__((__target__("avx2")))
> +#endif
> +search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
> +{
> +  typedef char v32qi __attribute__ ((__vector_size__ (32)));
> +
> +  const v32qi repl_nl = *(const v32qi *)repl_chars[0];
> +  const v32qi repl_cr = *(const v32qi *)repl_chars[1];
> +  const v32qi repl_bs = *(const v32qi *)repl_chars[2];
> +  const v32qi repl_qm = *(const v32qi *)repl_chars[3];
> +
> +  unsigned int misalign, found, mask;
> +  const v32qi *p;
> +  v32qi data, t;
> +
> +  /* Align the source pointer.  */
> +  misalign = (uintptr_t)s & 31;
> +  p = (const v32qi *)((uintptr_t)s & -32);
> +  data = *p;
> +
> +  /* Create a mask for the bytes that are valid within the first
> +     32-byte block.  The Idea here is that the AND with the mask
> +     within the loop is "free", since we need some AND or TEST
> +     insn in order to set the flags for the branch anyway.  */
> +  mask = -1u << misalign;
> +
> +  /* Main loop processing 32 bytes at a time.  */
> +  goto start;
> +  do
> +    {
> +      data = *++p;
> +      mask = -1;
> +
> +    start:
> +      t  = data == repl_nl;
> +      t |= data == repl_cr;
> +      t |= data == repl_bs;
> +      t |= data == repl_qm;
> +      found = __builtin_ia32_pmovmskb256 (t);

Using the builtin here seems wrong. Why not use the intrinsic
_mm256_movemask_epi8 ?
Oh I noticed that there is other similar builtin uses for the x86 case.
Also it might make sense to remove the MMX version.

Thanks,
Andrew Pinski



> +      found &= mask;
> +    }
> +  while (!found);
> +
> +  /* FOUND contains 1 in bits for which we matched a relevant
> +     character.  Conversion to the byte index is trivial.  */
> +  found = __builtin_ctz (found);
> +  return (const uchar *)p + found;
> +}
> +
> +#else
> +#define search_line_avx2 search_line_sse2
> +#endif
> +
> +#ifdef __AVX2__
> +/* Avoid indirect calls to encourage inlining if the compiler is built
> +   using AVX.  */
> +#define search_line_fast search_line_avx2
> +#else
> +
>  /* Check the CPU capabilities.  */
>
>  #include "../gcc/config/i386/cpuid.h"
> @@ -436,7 +514,7 @@ static search_line_fast_type search_line_fast;
>  static inline void
>  init_vectorized_lexer (void)
>  {
> -  unsigned dummy, ecx = 0, edx = 0;
> +  unsigned dummy, ecx = 0, edx = 0, ebx = 0;
>    search_line_fast_type impl = search_line_acc_char;
>    int minimum = 0;
>
> @@ -448,6 +526,10 @@ init_vectorized_lexer (void)
>
>    if (minimum == 3)
>      impl = search_line_sse42;
> +  else if (__get_cpuid_max (0, &dummy) >= 7
> +              && __get_cpuid_count (7, 0, &dummy, &ebx, &dummy, &dummy)
> +              && (ebx & bit_AVX2))
> +    impl = search_line_avx2;
>    else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
>      {
>        if (minimum == 3 || (ecx & bit_SSE4_2))
> @@ -458,6 +540,7 @@ init_vectorized_lexer (void)
>
>    search_line_fast = impl;
>  }
> +#endif /* !__AVX2__ */
>
>  #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
>
> --
> 2.45.2
>

Reply via email to