https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
Uroš Bizjak <ubizjak at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |pan2.li at intel dot com
Target Milestone|--- |15.0
--- Comment #18 from Uroš Bizjak <ubizjak at gmail dot com> ---
Implemented in gcc-15.

(In reply to Tamar Christina from comment #5)
> Yeah, this is hurting us a lot on vectors as well:
>
> https://godbolt.org/z/ecnGadxcG
Similar testcase:
--cut here--
/* Saturating unsigned-short addition: returns x + y, clamped to
   USHRT_MAX on overflow, expressed via __builtin_add_overflow.
   NOTE(review): this is a compiler testcase — keep the exact pattern;
   the bug is about GCC recognizing this form as a saturating add.  */
unsigned short
add_sat(unsigned short x, unsigned short y)
{
unsigned short z;
/* z gets the wrapped sum; the builtin returns true iff the add overflowed.  */
if (!__builtin_add_overflow(x, y, &z))
return z;
/* -1u truncates to 0xffff (USHRT_MAX), the saturation value.  */
return -1u;
}
/* Branchless saturating unsigned-short addition, same contract as
   add_sat above: x + y clamped to USHRT_MAX.
   NOTE(review): testcase — the exact expression shape is what the
   compiler must pattern-match; do not restyle.  */
unsigned short
add_sat2(unsigned short x, unsigned short y)
{
unsigned short res;
res = x + y;            /* wrapped (modular) sum */
res |= -(res < x);      /* res < x iff the add wrapped; -(1) == all-ones mask */
return res;
}
/* Vectorization driver: b[i] = saturating_add(a[i], b[i]) for 8 lanes.
   With -O2 -ftree-vectorize this is expected to become a single
   paddusw (see quoted assembly below in the report).  restrict
   promises a and b do not alias.  */
void f(unsigned short * restrict a, unsigned short * restrict b)
{
for (int i = 0; i < 8; i++)
{
b[i] = add_sat (a[i], b[i]);
}
}
/* Same vectorization driver but using the branchless add_sat2 form.
   This is the function the bug report says still fails to vectorize
   to paddusw (GCC emits psubusw + double pcmpeqw + por instead).  */
void f2(unsigned short * restrict a, unsigned short * restrict b)
{
for (int i = 0; i < 8; i++)
{
b[i] = add_sat2 (a[i], b[i]);
}
}
--cut here--
now generates optimal code on x86 (-O2 -ftree-vectorize) for all functions but
f2.
add_sat:
addw %di, %si
movl $-1, %eax
cmovnc %esi, %eax
ret
add_sat2:
addw %di, %si
movl $-1, %eax
cmovnc %esi, %eax
ret
f:
movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm0
paddusw %xmm1, %xmm0
movups %xmm0, (%rsi)
ret
f2:
movdqu (%rdi), %xmm0
movdqu (%rsi), %xmm1
pxor %xmm2, %xmm2
paddw %xmm0, %xmm1
psubusw %xmm1, %xmm0
pcmpeqw %xmm2, %xmm0
pcmpeqw %xmm2, %xmm0
por %xmm1, %xmm0
movups %xmm0, (%rsi)
Adding CC for the missing vectorization problem. clang is able to produce the
same assembly for f2 as for f.