https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85052

--- Comment #6 from Devin Hussey <husseydevin at gmail dot com> ---
The patch seems to be working.

typedef unsigned u32x2 __attribute__((vector_size(8)));
typedef unsigned long long u64x2 __attribute__((vector_size(16)));

u64x2 cvt(u32x2 in)
{
    return __builtin_convertvector(in, u64x2);
}

It doesn't generate the best code, but it isn't bad.

x86_64, SSE4.1:

cvt:
        movq    %xmm0, %rax
        movd    %eax, %xmm0
        shrq    $32, %rax
        pinsrq  $1, %rax, %xmm0
        ret

x86_64, SSE2:

cvt:
        movq    %xmm0, %rax
        movd    %eax, %xmm0
        shrq    $32, %rax
        movq    %rax, %xmm1
        punpcklqdq      %xmm1, %xmm0
        ret

ARMv7a NEON:

cvt:
        sub     sp, sp, #16
        mov     r3, #0
        str     r3, [sp, #4]
        str     r3, [sp, #12]
        add     r3, sp, #8
        vst1.32 {d0[0]}, [sp]
        vst1.32 {d0[1]}, [r3]
        vld1.64 {d0-d1}, [sp:64]
        add     sp, sp, #16
        bx      lr

I haven't built the others yet.

The ideal code would be this (where [s|u] selects the signed or unsigned variant):

cvt:
    vmovl.[s|u]32    q0, d0
    bx lr

I am testing other targets now. 

For reference, this is what clang generates for other targets:

aarch64:

cvt:
        [s|u]shll   v0.2d, v0.2s, #0
        ret

sse4.1/avx:

cvt:
        [v]pmov[s|z]xdq        xmm0, xmm0
        ret

sse2:

signed_cvt:
        pxor    xmm1, xmm1
        pcmpgtd xmm1, xmm0
        punpckldq       xmm0, xmm1      # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
        ret

unsigned_cvt:
        xorps   xmm1, xmm1
        unpcklps        xmm0, xmm1      # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
        ret

Reply via email to