[Bug tree-optimization/92645] Hand written vector code is 450 times slower when compiled with GCC compared to Clang

amonakov at gcc dot gnu.org Mon, 25 Nov 2019 05:52:24 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92645


Alexander Monakov <amonakov at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #8 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #5)
> 
> "extracting" the actual loops (inlined and all) in intrinsic form as a C
> testcase would be really really nice.

Something like the following?  Enjoy!

/* GCC/Clang vector-extension types; vector_size gives the total size in bytes. */
/* 16-byte vector of unsigned int lanes. */
typedef unsigned int u32v4 __attribute__((vector_size(16)));
/* 32-byte vector of unsigned short lanes. */
typedef unsigned short u16v16 __attribute__((vector_size(32)));
/* 16-byte vector of unsigned char lanes (16 lanes). */
typedef unsigned char u8v16 __attribute__((vector_size(16)));

/* One 16-byte value that can be accessed either as unsigned char lanes (u8)
   or as unsigned int lanes (u32). */
union vec128 {
  u8v16 u8;
  u32v4 u32;
};

/* Route memcpy calls below straight to the compiler builtin. */
#define memcpy __builtin_memcpy

/* Zero-extend: copy each of the 16 unsigned char lanes of x into the
   unsigned short lane with the same index and return the wider vector. */
u16v16 zxt(u8v16 x)
{
  u16v16 widened = {0};

  for (int lane = 0; lane < 16; lane++) {
    widened[lane] = x[lane];
  }
  return widened;
}

/* Narrow: convert each of the 16 unsigned short lanes of x to unsigned char
   (keeping only the low 8 bits) and return the resulting 16-byte vector. */
u8v16 narrow(u16v16 x)
{
  u8v16 packed = {0};

  for (int lane = 0; lane < 16; lane++) {
    packed[lane] = (unsigned char)x[lane];
  }
  return packed;
}

void f(char *dst, char *src, unsigned long n, unsigned c)
{
  unsigned ia = 255 - (c >> 24);
  ia += ia >> 7;

  union vec128 c4 = {0}, ia16 = {0};
  c4.u32 += c;
  ia16.u8 += (unsigned char)ia;

  u16v16 c16 = (zxt(c4.u8) << 8) + 128;

  for (; n; src += 16, dst += 16, n -= 4) {
    union vec128 s;
    memcpy(&s, src, sizeof s);
    s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
    memcpy(dst, &s, sizeof s);
  }
}

[Bug tree-optimization/92645] Hand written vector code is 450 times slower when compiled with GCC compared to Clang

Reply via email to