https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71414
--- Comment #4 from Yichao Yu <yyc1992 at gmail dot com> ---

The C code is in the linked gist. `a` is a cacheline-aligned pointer and `n` is 1024, so `a` should even fit in L1d, which is 32kB on both processors I benchmarked.

More precise timings (ns per loop):

6700K
```
% ./benchmark-gcc
80.553456
% ./benchmark-clang37
28.222281
% ./benchmark-clang38
41.782532
```

4702HQ
```
% ./benchmark-gcc
140.744893
% ./benchmark-clang37
50.835441
% ./benchmark-clang38
70.220946
```

Pasting the whole program over for completeness. The alignment line gives some weird timing on clang without `-mcore-avx2` but doesn't change anything too much with `-Ofast -mcore-avx2`.

```
//

#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>

uint64_t
gettime_ns()
{
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
}

__attribute__((noinline)) float
sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float s = 0;
    for (size_t i = 0;i < n;i++)
        s += a[i];
    __asm__ volatile ("" ::: "memory");
    return s;
}

int
main()
{
    float *p = aligned_alloc(64, sizeof(float) * 1024);
    memset(p, 0, sizeof(float) * 1024);
    uint64_t start = gettime_ns();
    for (int i = 0;i < 1024 * 1024;i++)
        sum32(p, 1024);
    free(p);
    uint64_t end = gettime_ns();
    printf("%f\n", (end - start) / (1024.0 * 1024.0));
    return 0;
}
```