https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
If I make the arrays random then GCC code is indeed faster:

#include <math.h>
#include <malloc.h>
typedef float real_t;

#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;

int main(void)
{

//    reductions
//    if to max reduction

    real_t x;
    for (int i = 0; i < LEN_1D; i++)
      {
        a[i]=(rand() %5) - 3;
        b[i]=(rand() %6) - 3;
      }
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            if (a[i] < (real_t)0.) {
                if (b[i] > a[i]) {
                    c[i] += d[i] * e[i];
                }
            }
        }
        //dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    return x;
}

jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast s1279.c -march=native
s1279.c:23:14: warning: implicit declaration of function 'rand' is invalid in C99 [-Wimplicit-function-declaration]
        a[i]=(rand() %5) - 3;
              ^
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out

real    0m5.638s
user    0m5.636s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast s1279.c -march=native
s1279.c: In function 'main':
s1279.c:23:14: warning: implicit declaration of function 'rand' [-Wimplicit-function-declaration]
   23 |         a[i]=(rand() %5) - 3;
      |               ^~~~
jh@alberti:~/tsvc/bin> time ./a.out

real    0m2.791s
user    0m2.790s
sys     0m0.000s

sorry for wrong code, just for reference the loop compiles as:

.L4:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L3:
        vmovaps a(%rax), %ymm2
        vmovaps b(%rax), %ymm3
        vmovaps c(%rax), %ymm6
        addq    $32, %rax
        vmovaps c-32(%rax), %ymm0
        vmovaps e-32(%rax), %ymm4
        vcmpps  $1, %ymm1, %ymm2, %k1
        vcmpps  $14, %ymm2, %ymm3, %k1{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0
        vblendmps       %ymm0, %ymm6, %ymm0{%k1}
        vmovaps %ymm0, c-32(%rax)
        cmpq    $128000, %rax
        jne     .L3
        decl    %edx
        jne     .L4