https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081
--- Comment #19 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
(In reply to H.J. Lu from comment #18)
> (In reply to Haochen Jiang from comment #17)
> >
> > As for reproducing it: the regression is not limited to ADL. The fix patch
> > showed a ~2-4% regression for 511.povray_r with o2_generic_v3 on all of
> > Cascade Lake/Ice Lake/Sapphire Rapids.
>
> Can you extract some testcases to show more PUSH and POP?
The original case was a bit more complicated, so I tried to mimic it by writing
a similar one:
/* Helpers are only declared, never defined, in this testcase; foo calls
   both of them.  */
extern int bar (double *a, double *b, double *c, double *d, double *e);
extern bool foo2 (double *a, double b);

/* Reduced testcase for the extra PUSH/POP seen in the listings below.

   d1 and d2 are passed to bar by address and read afterwards, so bar is
   expected to write them.  For each of d1 and d2 that lies strictly
   between 0.0 and 100.0, c[0..2] is recomputed as a[i] + d * b[i] and
   foo2 is called on the result.

   Returns 1 if any foo2 call returned true, 0 otherwise (including when
   bar returns zero).  */
int
foo (double *a, double *b, double *c)
{
  int rr = 0;
  double d1;
  double d2;

  /* The original annotation was the bare text "--- mostly false;", which
     is not valid C; it is kept here as a comment so the block below stays
     guarded by the call to bar.  */
  if (bar (a, b, c, &d1, &d2)) /* mostly false */
    {
      if (d1 > 0.0 && d1 < 100.0)
        {
          c[0] = a[0] + d1 * b[0];
          c[1] = a[1] + d1 * b[1];
          c[2] = a[2] + d1 * b[2];
          if (foo2 (c, d1))
            rr = 1;
        }
      if (d2 > 0.0 && d2 < 100.0)
        {
          c[0] = a[0] + d2 * b[0];
          c[1] = a[1] + d2 * b[1];
          c[2] = a[2] + d2 * b[2];
          if (foo2 (c, d2))
            rr = 1;
        }
    }
  return rr;
}
Before r15-7400
# Code generated for foo before r15-7400.  This is an excerpt: the .L21 and
# .L22 branch targets and the .LC1 constant are not shown.
# The prologue pushes two registers (%rbp, %rbx) and the epilogue pops two.
foo:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
# First argument (a) is copied into %rbp.
movq %rdi, %rbp
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
# Third argument (c) is copied into %rbx.
movq %rdx, %rbx
subq $40, %rsp
.cfi_def_cfa_offset 64
# Fourth and fifth arguments to bar: addresses of two stack slots (&d1, &d2).
leaq 16(%rsp), %rcx
leaq 24(%rsp), %r8
# Second argument (b) is stored to a stack slot rather than kept in a register.
movq %rsi, 8(%rsp)
call bar
# bar's return value is copied to %edx; zero jumps straight to the epilogue.
movl %eax, %edx
testl %eax, %eax
je .L1
# Range check on d1: compare against zero, then against the constant at .LC1
# (presumably 100.0 -- .LC1 is not part of this excerpt).
vmovsd 16(%rsp), %xmm0
vxorpd %xmm1, %xmm1, %xmm1
# b is reloaded from its stack slot.
movq 8(%rsp), %rsi
vcomisd %xmm1, %xmm0
jbe .L18
vmovsd .LC1(%rip), %xmm1
vcomisd %xmm0, %xmm1
ja .L21
.L18:
# Result register is cleared when the d1 check fails.
xorl %edx, %edx
.L3:
# Same range check on d2.
vmovsd 24(%rsp), %xmm0
vxorpd %xmm1, %xmm1, %xmm1
vcomisd %xmm1, %xmm0
jbe .L1
vmovsd .LC1(%rip), %xmm1
vcomisd %xmm0, %xmm1
ja .L22
.L1:
# Epilogue: release the frame, move the result to %eax, pop %rbx and %rbp.
addq $40, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 24
movl %edx, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
After r15-7400
# Code generated for foo after r15-7400.  This is an excerpt: the .L21 and
# .L22 branch targets and the .LC1 constant are not shown.
# The prologue pushes four registers (%r13, %r12, %rbp, %rbx) and the
# epilogue pops four.
foo:
.LFB0:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
# Second argument (b) is copied into %r13; no stack slot is used for it.
movq %rsi, %r13
pushq %r12
.cfi_def_cfa_offset 24
.cfi_offset 12, -24
# First argument (a) is copied into %r12.
movq %rdi, %r12
pushq %rbp
.cfi_def_cfa_offset 32
.cfi_offset 6, -32
# Third argument (c) is copied into %rbp.
movq %rdx, %rbp
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset 3, -40
subq $24, %rsp
.cfi_def_cfa_offset 64
# Fourth and fifth arguments to bar: addresses of two stack slots (&d1, &d2).
movq %rsp, %rcx
leaq 8(%rsp), %r8
call bar
# bar's return value is copied to %ebx; zero jumps straight to the epilogue.
movl %eax, %ebx
testl %eax, %eax
je .L1
# Range check on d1: compare against zero, then against the constant at .LC1
# (presumably 100.0 -- .LC1 is not part of this excerpt).
vmovsd (%rsp), %xmm0
vxorpd %xmm1, %xmm1, %xmm1
vcomisd %xmm1, %xmm0
jbe .L18
vmovsd .LC1(%rip), %xmm1
vcomisd %xmm0, %xmm1
ja .L21
.L18:
# Result register is cleared when the d1 check fails.
xorl %ebx, %ebx
.L3:
# Same range check on d2.
vmovsd 8(%rsp), %xmm0
vxorpd %xmm1, %xmm1, %xmm1
vcomisd %xmm1, %xmm0
jbe .L1
vmovsd .LC1(%rip), %xmm1
vcomisd %xmm0, %xmm1
ja .L22
.L1:
# Epilogue: release the frame, move the result to %eax, then pop %rbx, %rbp,
# %r12 and %r13.
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 40
movl %ebx, %eax
popq %rbx
.cfi_def_cfa_offset 32
popq %rbp
.cfi_def_cfa_offset 24
popq %r12
.cfi_def_cfa_offset 16
popq %r13
.cfi_def_cfa_offset 8
ret
With more callee-saved registers in use, the callee has to save them on entry
and restore them before exit (the extra PUSH/POP pairs above), which would not
be needed if more caller-saved registers were used instead.