https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
Uroš Bizjak <ubizjak at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |pan2.li at intel dot com
Target Milestone|--- |15.0
--- Comment #18 from Uroš Bizjak <ubizjak at gmail dot com> ---
Implemented in gcc-15.

(In reply to Tamar Christina from comment #5)
> Yeah, this is hurting us a lot on vectors as well:
>
> https://godbolt.org/z/ecnGadxcG
Similar testcase:
--cut here--
/* Saturating unsigned-short addition: returns x + y, clamped to
   USHRT_MAX on overflow, expressed via __builtin_add_overflow.
   NOTE(review): this is a compiler testcase — keep the exact pattern;
   the bug is about GCC recognizing this form as a saturating add.  */
unsigned short
add_sat(unsigned short x, unsigned short y)
{
unsigned short z;
/* z gets the wrapped sum; the builtin returns true iff the add overflowed.  */
if (!__builtin_add_overflow(x, y, &z))
return z;
/* -1u truncates to 0xffff (USHRT_MAX), the saturation value.  */
return -1u;
}
/* Branchless saturating unsigned-short addition, same contract as
   add_sat above: x + y clamped to USHRT_MAX.
   NOTE(review): testcase — the exact expression shape is what the
   compiler must pattern-match; do not restyle.  */
unsigned short
add_sat2(unsigned short x, unsigned short y)
{
unsigned short res;
res = x + y;            /* wrapped (modular) sum */
res |= -(res < x);      /* res < x iff the add wrapped; -(1) == all-ones mask */
return res;
}
/* Vectorization driver: b[i] = saturating_add(a[i], b[i]) for 8 lanes.
   With -O2 -ftree-vectorize this is expected to become a single
   paddusw (see quoted assembly below in the report).  restrict
   promises a and b do not alias.  */
void f(unsigned short * restrict a, unsigned short * restrict b)
{
for (int i = 0; i < 8; i++)
{
b[i] = add_sat (a[i], b[i]);
}
}
/* Same vectorization driver but using the branchless add_sat2 form.
   This is the function the bug report says still fails to vectorize
   to paddusw (GCC emits psubusw + double pcmpeqw + por instead).  */
void f2(unsigned short * restrict a, unsigned short * restrict b)
{
for (int i = 0; i < 8; i++)
{
b[i] = add_sat2 (a[i], b[i]);
}
}
--cut here--
now generates optimal code on x86 (-O2 -ftree-vectorize) for all functions but
f2.
add_sat:
addw %di, %si
movl $-1, %eax
cmovnc %esi, %eax
ret
add_sat2:
addw %di, %si
movl $-1, %eax
cmovnc %esi, %eax
ret
f:
movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm0
paddusw %xmm1, %xmm0
movups %xmm0, (%rsi)
ret
f2:
movdqu (%rdi), %xmm0
movdqu (%rsi), %xmm1
pxor %xmm2, %xmm2
paddw %xmm0, %xmm1
psubusw %xmm1, %xmm0
pcmpeqw %xmm2, %xmm0
pcmpeqw %xmm2, %xmm0
por %xmm1, %xmm0
movups %xmm0, (%rsi)
Adding CC for the missing vectorization problem. clang is able to produce the
same assembly for f2 as for f.