https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105513
--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Alexander Monakov from comment #7)
> The second sequence is 3 uops vs 1/2 (issued/executed) uops in first, and on
> Haswell and Skylake it ties up port 5 for two cycles.
>
> Unclear if you're microbenchmarking latency or throughput, but in any case
> on Haswell and Skylake you should see a close to 2x difference.

I'm counting clock ticks, and thought a load might have higher latency.

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#define LOOP 1000000000

typedef long v2di __attribute__((vector_size(16)));
typedef int v4si __attribute__((vector_size(16)));

v2di
__attribute__ ((noipa))
foo (v2di a)
{
  a[1] = 111113;
  return a;
}

void
__attribute__ ((noipa))
foo1 (v2di a)
{
}

int main ()
{
  int i;
  unsigned long long start, end;
  unsigned long long diff;
  unsigned int aux;
  start = __rdtscp (&aux);
  v2di b = __extension__ (v2di){111, 222};
  for (i = 0; i < LOOP; i++)
    {
      v2di a = foo (b);
      foo1 (a);
    }
  end = __rdtscp (&aux);
  diff = end - start;
  printf ("alterna: %lld\n", diff);
  return 0;
}