https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71414

--- Comment #4 from Yichao Yu <yyc1992 at gmail dot com> ---
The C code is in the linked gist. `a` is a cacheline-aligned pointer and `n` is
1024, so `a` should even fit in L1d, which is 32kB on both processors I
benchmarked.

More precise timing (ns per loop)

6700K

```
% ./benchmark-gcc           
80.553456
% ./benchmark-clang37 
28.222281
% ./benchmark-clang38 
41.782532
```

4702HQ

```
% ./benchmark-gcc 
140.744893
% ./benchmark-clang37 
50.835441
% ./benchmark-clang38
70.220946
```

Pasting the whole program over for completeness.
The alignment line gives some weird timing on clang without `-march=core-avx2`,
but doesn't change anything too much with `-Ofast -march=core-avx2`.

```
//

#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>

/*
 * Read CLOCK_MONOTONIC and return the reading folded into a single
 * nanosecond count (seconds scaled by 1e9, plus the nanosecond field).
 */
uint64_t gettime_ns()
{
    const uint64_t ns_per_sec = (uint64_t) 1e9;
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    uint64_t whole_seconds_ns = now.tv_sec * ns_per_sec;
    return whole_seconds_ns + now.tv_nsec;
}


/*
 * Add up the first n elements of a, one at a time in increasing index
 * order, and return the total. Returns 0 when n is 0.
 *
 * Marked noinline so the function is always reached through a real call.
 */
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* Disabled hint that would tell the compiler a is 64-byte aligned: */
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float total = 0;
    size_t idx = 0;

    while (idx < n) {
        total += a[idx];
        idx++;
    }

    /* Empty asm statement with a "memory" clobber: a compiler-level barrier. */
    __asm__ volatile ("" ::: "memory");
    return total;
}

/*
 * Benchmark driver.
 *
 * Allocates a zero-filled, 64-byte-aligned buffer of 1024 floats, calls
 * sum32() on it 1024 * 1024 times, and prints the average elapsed time per
 * call in nanoseconds, as measured by gettime_ns().
 *
 * Returns 0 on success, or EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    enum { N_ELEMS = 1024, N_CALLS = 1024 * 1024 };
    const size_t bytes = sizeof(float) * N_ELEMS;

    float *p = aligned_alloc(64, bytes);
    if (p == NULL) {
        /* Passing a null pointer to memset() would be undefined behaviour. */
        fputs("aligned_alloc failed\n", stderr);
        return EXIT_FAILURE;
    }
    memset(p, 0, bytes);

    uint64_t start = gettime_ns();
    for (int i = 0; i < N_CALLS; i++)
        sum32(p, N_ELEMS);
    /* Stop the clock before releasing the buffer so free() is not timed. */
    uint64_t end = gettime_ns();

    free(p);

    printf("%f\n", (end - start) / (double) N_CALLS);
    return 0;
}
```

Reply via email to