https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114987
--- Comment #3 from Colin Ian King <colin.king at intel dot com> ---
perf report for the compute loop of stress_vecfp_float_add_16.avx, built with gcc-13:
57.93 │200: vaddps 0xc0(%rsp),%ymm3,%ymm5
11.11 │ vaddps 0xe0(%rsp),%ymm2,%ymm6
0.02 │ vmovaps %ymm5,0x60(%rsp)
2.92 │ mov 0x60(%rsp),%rax
│ mov 0x68(%rsp),%rdx
0.37 │ vmovaps %ymm6,0x40(%rsp)
│ vmovaps %ymm5,0x80(%rsp)
6.30 │ vmovq %rax,%xmm1
4.11 │ mov 0x40(%rsp),%rax
│ vmovdqa 0x90(%rsp),%xmm5
│ vmovaps %ymm6,0xa0(%rsp)
3.27 │ vpinsrq $0x1,%rdx,%xmm1,%xmm1
│ mov 0x48(%rsp),%rdx
│ vmovdqa 0xb0(%rsp),%xmm6
3.22 │ vmovdqa %xmm1,0xc0(%rsp)
0.42 │ vmovq %rax,%xmm0
│ vmovdqa %xmm5,0xd0(%rsp)
6.80 │ vpinsrq $0x1,%rdx,%xmm0,%xmm0
3.52 │ vmovdqa %xmm0,0xe0(%rsp)
│ vmovdqa %xmm6,0xf0(%rsp)
│ sub $0x1,%ecx
│ ↑ jne 200
perf report for the compute loop of stress_vecfp_float_add_16.avx, built with gcc-14:
65.79 │200: vaddps 0xc0(%rsp),%ymm3,%ymm5
3.26 │ vaddps 0xe0(%rsp),%ymm2,%ymm6
0.00 │ vmovaps %ymm5,0x60(%rsp)
9.25 │ mov 0x60(%rsp),%rax
0.00 │ mov 0x68(%rsp),%rdx
│ vmovaps %ymm6,0x40(%rsp)
│ vmovaps %ymm5,0x80(%rsp)
6.49 │ vmovq %rax,%xmm1
0.00 │ mov 0x40(%rsp),%rax
0.00 │ vmovaps %ymm6,0xa0(%rsp)
3.02 │ vpinsrq $0x1,%rdx,%xmm1,%xmm1
│ mov 0x48(%rsp),%rdx
0.35 │ vmovdqa %xmm1,0xc0(%rsp)
0.68 │ vmovq %rax,%xmm0
0.00 │ vmovdqa 0x90(%rsp),%xmm1
5.18 │ vpinsrq $0x1,%rdx,%xmm0,%xmm0
3.00 │ vmovdqa %xmm0,0xe0(%rsp)
│ vmovdqa 0xb0(%rsp),%xmm0
│ vmovdqa %xmm1,0xd0(%rsp)
│ vmovdqa %xmm0,0xf0(%rsp)
│ sub $0x1,%ecx
2.94 │ ↑ jne 200
