https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719
Bug ID: 107719 Summary: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- jh@alberti:~/tsvc/bin> cat tt5.c #include <math.h> typedef double real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; real_t qq; int main(void) { // reductions // maximum of absolute value real_t max; for (int nl = 0; nl < iterations*4; nl++) { max = fabs(a[0]); for (int i = 0; i < LEN_1D; i++) { if ((fabs(a[i])) > max) { max = fabs(a[i]); } } qq += max; } return max; } jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=native tt5.c ; perf stat ./a.out Performance counter stats for './a.out': 913.92 msec task-clock:u # 0.999 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 108 page-faults:u # 118.172 /sec 3,342,731,634 cycles:u # 3.658 GHz (83.37%) 15,353 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.37%) 12,484 stalled-cycles-backend:u # 0.00% backend cycles idle (83.38%) 7,989,930,772 instructions:u # 2.39 insn per cycle # 0.00 stalled cycles per insn (83.37%) 1,597,552,117 branches:u # 1.748 G/sec (83.37%) 401,094 branch-misses:u # 0.03% of all branches (83.13%) 0.914933333 seconds time elapsed 0.914630000 seconds user 0.000000000 seconds sys jh@alberti:~/tsvc/bin> gcc -Ofast -march=native tt5.c ; perf stat ./a.out Performance counter stats for './a.out': 880.97 msec task-clock:u # 0.999 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 110 page-faults:u # 124.862 /sec 3,218,698,288 cycles:u # 3.654 GHz (83.21%) 11,566 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.21%) 12,185 stalled-cycles-backend:u # 0.00% backend cycles idle (83.21%) 7,989,544,164 instructions:u # 2.48 insn per cycle # 0.00 stalled 
cycles per insn (83.48%) 1,597,229,244 branches:u # 1.813 G/sec (83.66%) 401,157 branch-misses:u # 0.03% of all branches (83.23%) 0.881919601 seconds time elapsed 0.881627000 seconds user 0.000000000 seconds sys The difference is outside of measurement noise. GCC 7.5 produces: main: .LFB0: .cfi_startproc vmovsd a(%rip), %xmm4 vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx vmovapd .LC1(%rip), %zmm2 vandps .LC0(%rip), %xmm4, %xmm4 vbroadcastsd %xmm4, %zmm4 .p2align 4,,15 .L3: movl $a, %eax vmovapd %zmm4, %zmm0 .p2align 4,,15 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 vshufi32x4 $78, %zmm0, %zmm0, %zmm1 decl %ecx vmaxpd %zmm0, %zmm1, %zmm0 vshufi32x4 $77, %zmm0, %zmm0, %zmm1 vmaxpd %zmm0, %zmm1, %zmm1 vpshufd $254, %zmm1, %zmm0 vmaxpd %zmm1, %zmm0, %zmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) vcvttsd2si %xmm0, %eax vzeroupper ret .cfi_endproc while trunk produces: main: .LFB0: .cfi_startproc vmovsd a(%rip), %xmm4 vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx vandpd .LC0(%rip), %xmm4, %xmm4 vbroadcastsd .LC2(%rip), %zmm2 vbroadcastsd %xmm4, %zmm4 .p2align 4 .p2align 3 .L3: vmovapd %zmm4, %zmm0 movl $a, %eax .p2align 4 .p2align 3 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 vextractf64x4 $0x1, %zmm0, %ymm1 decl %ecx vmaxpd %ymm0, %ymm1, %ymm0 vextractf64x2 $0x1, %ymm0, %xmm1 vmaxpd %xmm0, %xmm1, %xmm1 vunpckhpd %xmm1, %xmm1, %xmm0 vmaxpd %xmm1, %xmm0, %xmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) vcvttsd2sil %xmm0, %eax vzeroupper ret .cfi_endproc So there is no difference in the inner loop (.L2); the two listings differ only in the setup code, the loop alignment directives and the final reduction, as the diff shows: @@ -11,67 +11,82 @@ vmovsd qq(%rip), %xmm3 movl $400000, %ecx movl $a+256000, %edx - vmovapd .LC1(%rip), %zmm2 - vandps .LC0(%rip), %xmm4, %xmm4 + vandpd .LC0(%rip), %xmm4, %xmm4 + vbroadcastsd .LC2(%rip), %zmm2 vbroadcastsd %xmm4, %zmm4 - .p2align 4,,15 + .p2align 4 + .p2align 3 .L3: - movl $a, %eax vmovapd %zmm4, %zmm0 - .p2align 4,,15 + movl $a, %eax + 
.p2align 4 + .p2align 3 .L2: vandpd (%rax), %zmm2, %zmm1 addq $64, %rax vmaxpd %zmm1, %zmm0, %zmm0 cmpq %rax, %rdx jne .L2 - vshufi32x4 $78, %zmm0, %zmm0, %zmm1 + vextractf64x4 $0x1, %zmm0, %ymm1 decl %ecx - vmaxpd %zmm0, %zmm1, %zmm0 - vshufi32x4 $77, %zmm0, %zmm0, %zmm1 - vmaxpd %zmm0, %zmm1, %zmm1 - vpshufd $254, %zmm1, %zmm0 - vmaxpd %zmm1, %zmm0, %zmm0 + vmaxpd %ymm0, %ymm1, %ymm0 + vextractf64x2 $0x1, %ymm0, %xmm1 + vmaxpd %xmm0, %xmm1, %xmm1 + vunpckhpd %xmm1, %xmm1, %xmm0 + vmaxpd %xmm1, %xmm0, %xmm0 vaddsd %xmm0, %xmm3, %xmm3 jne .L3 vmovsd %xmm3, qq(%rip) - vcvttsd2si %xmm0, %eax + vcvttsd2sil %xmm0, %eax