https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122528
--- Comment #2 from Manuel López-Ibáñez <manu at gcc dot gnu.org> ---
A smaller testcase showing the problem:
/*
 * Reduced test case for the bug report: compares the first three elements
 * of a and b.
 *
 * Returns -1 when a[i] <= b[i] for i = 0, 1, 2; otherwise 1 when the second
 * condition below is true; otherwise 0.
 *
 * NOTE(review): bool is used without <stdbool.h>, so this presumably needs
 * C23 (or an equivalent typedef) to compile -- confirm.
 */
int cmp(const double * restrict a, const double * restrict b)
{
/* Strict "less than" for each element pair. */
bool a_lt_b_0 = a[0] < b[0];
bool a_lt_b_1 = a[1] < b[1];
bool a_lt_b_2 = a[2] < b[2];
/* Equality for each element pair. */
bool a_eq_b_0 = a[0] == b[0];
bool a_eq_b_1 = a[1] == b[1];
bool a_eq_b_2 = a[2] == b[2];
/* a[i] <= b[i], assembled from the two results above with a bitwise OR. */
bool a_leq_b_0 = a_lt_b_0 | a_eq_b_0;
bool a_leq_b_1 = a_lt_b_1 | a_eq_b_1;
bool a_leq_b_2 = a_lt_b_2 | a_eq_b_2;
/* All three elements of a are <= their counterparts in b. */
if (a_leq_b_0 && a_leq_b_1 && a_leq_b_2)
return -1;
/*
 * && binds tighter than ||, so the condition is
 *   a_lt_b_0 || (a_eq_b_0 && a_lt_b_1) || (a_eq_b_1 && a_leq_b_2).
 * NOTE(review): the last term does not require a_eq_b_0; if a lexicographic
 * comparison was intended the inner || needs its own parentheses -- confirm.
 */
if (a_lt_b_0 || (a_eq_b_0 && a_lt_b_1 || (a_eq_b_1 && a_leq_b_2)))
return 1;
return 0;
}
Compiled with -O3 -march=x86-64-v3 -ffast-math
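The above generates 8 vcomisd instructions, but the source contains only 6
comparisons (and it should be possible to implement the function with just 3,
since a single vcomisd on a pair of elements sets the flags for both < and ==).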
"cmp":
# Register use, as read from the loads below and assuming the System V
# x86-64 convention (a in rdi, b in rsi):
#   xmm1 = a[0], xmm0 = b[0], xmm3 = a[1], xmm2 = b[1], xmm4 = a[2], xmm5 = b[2]
# The listing holds 8 vcomisd: four on (a[0], b[0]), two on (a[1], b[1]) and
# two on (a[2], b[2]). Comments below ignore unordered (NaN) results.
vmovsd xmm3, QWORD PTR [rdi+8]
vmovsd xmm2, QWORD PTR [rsi+8]
vmovsd xmm1, QWORD PTR [rdi]
vmovsd xmm0, QWORD PTR [rsi]
# vcomisd 1 of 8: compares b[1] with a[1].
vcomisd xmm2, xmm3
vmovsd xmm4, QWORD PTR [rdi+16]
vmovsd xmm5, QWORD PTR [rsi+16]
# Flags still come from the vcomisd above (the two loads sit in between).
# dl = (b[1] > a[1]), i.e. a_lt_b_1; branch to .L2 when b[1] < a[1].
seta dl
jb .L2
# vcomisd 2 of 8: b[0] vs a[0]; branch to .L2 when b[0] < a[0].
vcomisd xmm0, xmm1
jb .L2
# vcomisd 3 of 8: b[2] vs a[2]; return -1 when b[2] >= a[2]
# (a[0] <= b[0] and a[1] <= b[1] already hold on this path).
vcomisd xmm5, xmm4
mov eax, -1
jnb .L1
# vcomisd 4 of 8: repeats the b[0] vs a[0] comparison already done on this path.
vcomisd xmm0, xmm1
# Result is (b[0] > a[0]) | dl, zero-extended from the low byte.
seta al
or eax, edx
movzx eax, al
ret
.L2:
# Reached when a[1] > b[1] or a[0] > b[0].
# vcomisd 5 of 8: b[0] vs a[0] once more; return 1 when b[0] > a[0].
vcomisd xmm0, xmm1
mov eax, 1
ja .L1
# vcomisd 6 of 8: same pair with the operands swapped, used only to test equality.
vcomisd xmm1, xmm0
jne .L8
# a[0] == b[0] here: return 1 (still in eax) when dl (a_lt_b_1) is set.
test dl, dl
je .L8
.L1:
ret
.L8:
# vcomisd 7 of 8: a[1] vs b[1] again; dl = (a[1] == b[1]).
vcomisd xmm3, xmm2
sete dl
xor eax, eax
# vcomisd 8 of 8: b[2] vs a[2]; al = (b[2] >= a[2]).
vcomisd xmm5, xmm4
setnb al
# Return (a[2] <= b[2]) & (a[1] == b[1]).
and eax, edx
ret
https://godbolt.org/z/s3v5dcvn6