https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105513

--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Alexander Monakov from comment #7)
> The second sequence is 3 uops vs 1/2 (issued/executed) uops in first, and on
> Haswell and Skylake it ties up port 5 for two cycles.
> 
> Unclear if you're microbenchmarking latency or throughput, but in any case
> on Haswell and Skylake you should see a close to 2x difference.

I'm counting clock ticks, and I thought a load might have higher latency.

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

/* Number of iterations of the timed loop in main.  */
#define LOOP 1000000000
/* GNU C vector type: 16 bytes of long elements (two elements where
   long is 8 bytes).  */
typedef long v2di __attribute__((vector_size(16)));
/* GNU C vector type: 16 bytes of int elements.  Not referenced by the
   code shown here.  */
typedef int v4si __attribute__((vector_size(16)));

/* Return a copy of A whose element 1 is replaced by the constant
   111113; element 0 is passed through unchanged.  The noipa attribute
   keeps the compiler from inlining or otherwise optimizing across the
   call, so the element insertion is really performed on every call.  */
v2di
__attribute__ ((noipa))
foo (v2di a)
{
  v2di result = a;

  result[1] = 111113;
  return result;
}

/* Sink for a vector value: accepts A and does nothing with it.  Being
   noipa, calls to it are not optimized away, which keeps the value
   computed by the caller live.  */
void
__attribute__ ((noipa))
foo1 (v2di a)
{
  (void) a;
}

/* Benchmark driver: call foo followed by foo1 LOOP times and print the
   number of time-stamp-counter ticks that elapsed, as reported by two
   __rdtscp readings taken around the loop.  Always returns 0.  */
int
main (void)
{
  int i;
  unsigned long long start, end;
  unsigned long long diff;
  /* Receives the auxiliary value written by __rdtscp; otherwise unused.  */
  unsigned int aux;

  start = __rdtscp (&aux);
  v2di b = __extension__ (v2di){111, 222};
  for (i = 0; i < LOOP; i++)
    {
      v2di a = foo (b);
      foo1 (a);
    }
  end = __rdtscp (&aux);
  diff = end - start;
  /* diff is unsigned long long, so it must be printed with %llu; the
     signed %lld conversion does not match the argument type.  */
  printf ("alterna: %llu\n", diff);

  return 0;
}

Reply via email to