https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123631
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Gemini comes up with
#include <stdio.h>
#include <stdint.h>
#define ITERATIONS 100000000
/*
 * Read the x86 time-stamp counter.
 *
 * Executes the `rdtsc` instruction, taking the low 32 bits of the result
 * from EAX ("=a") and the high 32 bits from EDX ("=d"), and returns the two
 * halves combined into a single 64-bit value.
 */
static inline uint64_t rdtsc(void) {
    uint32_t lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    // Widen hi before shifting so the shift is performed in 64 bits.
    return ((uint64_t)hi << 32) | lo;
}
/*
 * Micro-benchmark comparing two ways of getting a broadcast 32-bit constant
 * (0xA) into XMM0:
 *
 *   1. GPR path:    mov imm -> EAX, vmovd EAX -> XMM0, vpbroadcastd.
 *   2. Memory path: vbroadcastss directly from a static constant in memory.
 *
 * Each loop runs ITERATIONS times; the elapsed rdtsc() delta and the
 * per-iteration average are printed for both variants.
 *
 * NOTE(review): the printed unit is "cycles", but the value is the raw
 * rdtsc() difference; whether that equals core clock cycles depends on the
 * CPU -- confirm before comparing across machines.
 *
 * Returns 0.
 */
int main(void) {
    uint64_t start, end;
    // Constant for the memory broadcast
    static const uint32_t CST = 0x0000000A;

    printf("Running benchmarks (%d iterations)...\n", ITERATIONS);

    // --- Test 1: GPR to XMM Path ---
    start = rdtsc();
    for (int i = 0; i < ITERATIONS; i++) {
        __asm__ __volatile__ (
            "mov $0xa, %%eax\n\t"
            "vmovd %%eax, %%xmm0\n\t"
            "vpbroadcastd %%xmm0, %%xmm0\n\t"
            "vpaddd %%xmm0, %%xmm1, %%xmm1\n\t" // Link xmm0 to the accumulator
            : : : "eax", "xmm0", "xmm1"
        );
    }
    end = rdtsc();
    // uint64_t is not necessarily unsigned long, so cast explicitly to a
    // type that has a matching printf conversion.
    printf("GPR Path: %llu cycles total (~%.2f cycles/iter)\n",
           (unsigned long long)(end - start),
           (double)(end - start) / ITERATIONS);

    // --- Test 2: Memory Path ---
    start = rdtsc();
    for (int i = 0; i < ITERATIONS; i++) {
        __asm__ __volatile__ (
            "vbroadcastss %0, %%xmm0\n\t"
            "vpaddd %%xmm0, %%xmm1, %%xmm1\n\t" // Link xmm0 to the accumulator
            : : "m"(CST) : "xmm0", "xmm1"
        );
    }
    end = rdtsc();
    printf("Memory Path: %llu cycles total (~%.2f cycles/iter)\n",
           (unsigned long long)(end - start),
           (double)(end - start) / ITERATIONS);

    return 0;
}
which on Zen2 shows
GPR Path: 126567740 cycles total (~1.27 cycles/iter)
Memory Path: 83913348 cycles total (~0.84 cycles/iter)
This obviously assumes the constant is already in the L1 cache. I'm unsure
whether LRA would ever re-materialize vector constants rather than
spilling/reloading them, but I expect us to hoist any such initialization
out of loops, and the non-memory variant consumes an extra GPR.