https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This also reproduces with zen4 and double.

jh@alberti:~/tsvc/bin> cat tt.c
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void
main(void)
{
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D-1; i++){
            a[i+1] = b[i]+c[i];
            b[i] = c[i]*e[i];
            d[i] = a[i]*e[i];
        }
    }
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native tt.c
jh@alberti:~/tsvc/bin> time ./a.out

real    0m3.590s
user    0m3.585s
sys     0m0.004s
jh@alberti:~/tsvc/bin> clang -Ofast -march=native tt.c
tt.c:6:1: warning: return type of 'main' is not 'int' [-Wmain-return-type]
void
^
tt.c:6:1: note: change return type to 'int'
void
^~~~
int
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.538s
user    0m1.538s
sys     0m0.000s

gcc generates:

       │ 60:   vmovapd 0x67e080(%rax),%zmm7
  0.15 │       vmovapd 0x601080(%rax),%zmm2
  1.07 │       add $0x40,%rax
       │       vaddpd 0x6bc840(%rax),%zmm7,%zmm0
  0.00 │       vmovupd %zmm0,0x6fb048(%rax)
 11.10 │       vmulpd 0x601040(%rax),%zmm7,%zmm0
  9.46 │       vmovapd %zmm0,0x6bc840(%rax)
  0.01 │       vmulpd 0x6fb040(%rax),%zmm2,%zmm0
 78.20 │       vmovapd %zmm0,0x63f840(%rax)
       │       cmp $0x3e7c0,%rax
       │     ↑ jne 60

clang generates:

       │       nop
       │ a0:   vmovupd (%r9,%rdx,1),%zmm15
  0.46 │       vmovupd (%r9,%rdi,1),%zmm19
  0.22 │       vmovupd 0x40(%r9,%rdx,1),%zmm16
  0.56 │       vmovupd 0x40(%r9,%rdi,1),%zmm22
  0.92 │       vmovupd 0x80(%r9,%rdx,1),%zmm17
  1.85 │       vmovupd 0x80(%r9,%rdi,1),%zmm21
  1.51 │       vaddpd (%r9,%rcx,1),%zmm15,%zmm18
  0.84 │       vmulpd %zmm15,%zmm19,%zmm15
  0.47 │       vmovupd %zmm15,(%r9,%rcx,1)
  3.37 │       vaddpd 0x40(%r9,%rcx,1),%zmm16,%zmm15
  0.56 │       vmulpd %zmm16,%zmm22,%zmm16
  0.69 │       vmovupd %zmm16,0x40(%r9,%rcx,1)
  3.82 │       vmovupd %zmm18,0x8(%r9,%rsi,1)
  3.27 │       vmovapd %zmm15,%zmm20
       │       vmovupd %zmm15,0x48(%r9,%rsi,1)
  3.60 │       vpermt2pd %zmm18,%zmm13,%zmm20
  0.47 │       vpermt2pd %zmm14,%zmm13,%zmm18
  0.36 │       vmulpd %zmm19,%zmm18,%zmm18
  1.07 │       vmulpd %zmm22,%zmm20,%zmm14
  1.33 │       vmovupd %zmm18,(%r9,%r8,1)
  6.31 │       vmovupd %zmm14,0x40(%r9,%r8,1)
  8.02 │       vaddpd 0x80(%r9,%rcx,1),%zmm17,%zmm14
  0.53 │       vmovapd %zmm14,%zmm16
  0.05 │       vmovupd %zmm14,0x88(%r9,%rsi,1)
  3.08 │       vpermt2pd %zmm15,%zmm13,%zmm16
  0.41 │       vmulpd %zmm17,%zmm21,%zmm15
  0.20 │       vmovupd %zmm15,0x80(%r9,%rcx,1)
  1.60 │       vmulpd %zmm21,%zmm16,%zmm15
  1.16 │       vmovupd %zmm15,0x80(%r9,%r8,1)
  3.13 │       add $0xc0,%r9
       │       cmp $0x3e7c0,%r9
  0.03 │     ↑ jne a0

So a forward-dependency here?