https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103571

--- Comment #18 from Hongtao.liu <crazylht at gmail dot com> ---
Codegen for foo1/foo2 is suboptimal under -mavx2; I guess we can add a
vec_setv16hf_0 pattern and implement it with vpblendw.

/* 32-byte GNU vector of _Float16 elements; used below as the type of the
   brace-enclosed vector literal.  */
typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32)));
/* Same 32-byte _Float16 vector layout, additionally marked __may_alias__;
   this is the return type of foo1/foo2.  */
typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));

/* Reproducer: build a 16-element _Float16 vector whose element 0 is the
   by-value argument X and whose remaining 15 elements are zero.

   The noinline/noclone attributes presumably keep the function as a
   standalone out-of-line body so its generated code can be inspected.  */
__m256h
__attribute__ ((noinline, noclone))
foo1 (_Float16 x)
{
  /* Vector literal of type __v16hf, cast to the __m256h return type.
     Only the first initializer is non-constant; the rest are 0.0f.  */
  return __extension__ (__m256h)(__v16hf) { x, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f };
}

/* Reproducer: same vector as foo1, but element 0 is loaded from memory
   through the pointer X instead of being passed by value.  Elements 1..15
   are zero.  X is dereferenced unconditionally.

   The noinline/noclone attributes presumably keep the function as a
   standalone out-of-line body so its generated code can be inspected.  */
__m256h
__attribute__ ((noinline, noclone))
foo2 (_Float16 *x)
{
  /* Vector literal of type __v16hf, cast to the __m256h return type.
     Only the first initializer (*x) is non-constant; the rest are 0.0f.  */
  return __extension__ (__m256h)(__v16hf) { *x, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f,
                                            0.0f, 0.0f, 0.0f, 0.0f };
}


# foo1 -- compiler output quoted in this report for foo1 (_Float16 x).
# Syntax: GAS / AT&T (sources first, destination last).
# In:    low 16-bit word of %xmm0 (presumably x, per the SysV AMD64 ABI)
# Out:   %ymm0 = { in, 0, 0, ..., 0 } (16 words)
# Clobb: %ymm1
# NOTE(review): broadcast + two blends + a register copy is the sequence
# the report text calls suboptimal.
foo1:
.LFB0:
        .cfi_startproc
        # ymm1 = 0 (a VEX-encoded xmm write also clears bits 255:128).
        vpxor   %xmm1, %xmm1, %xmm1
        # ymm0 = low word of xmm0 replicated into all 16 word lanes.
        vpbroadcastw    %xmm0, %ymm0
        # Mask 0x01 is applied per 128-bit lane: word 0 of each lane comes
        # from ymm0 (the broadcast), all other words from ymm1 (zero).
        # ymm0 now holds the input in word 0 and word 8, zero elsewhere.
        vpblendw        $1, %ymm0, %ymm1, %ymm0
        # Mask 0x0f: dwords 0-3 from ymm0, dwords 4-7 from ymm1 (zero).
        # This discards the stray copy in word 8; result lands in ymm1.
        vpblendd        $15, %ymm0, %ymm1, %ymm1
        # Move the finished vector into ymm0.
        vmovdqa %ymm1, %ymm0
        ret
        .cfi_endproc
.LFE0:
        # Symbol size = current location minus the foo1 label.
        .size   foo1, .-foo1
        # foo2 -- compiler output quoted in this report for foo2 (_Float16 *x).
        # Syntax: GAS / AT&T (sources first, destination last).
        # In:    %rdi = address of the 16-bit value to load
        # Out:   %ymm0 = { *rdi, 0, 0, ..., 0 } (16 words)
        # Clobb: %ymm1
        # NOTE(review): broadcast + two blends is the sequence the report
        # text calls suboptimal.
        # Align the entry point to 2^4 = 16 bytes.
        .p2align 4
        .globl  foo2
        .type   foo2, @function
foo2:
.LFB1:
        .cfi_startproc
        # ymm1 = 16-bit word at (%rdi) replicated into all 16 word lanes.
        vpbroadcastw    (%rdi), %ymm1
        # ymm0 = 0 (a VEX-encoded xmm write also clears bits 255:128).
        vpxor   %xmm0, %xmm0, %xmm0
        # Mask 0x01 is applied per 128-bit lane: word 0 of each lane comes
        # from ymm1 (the broadcast), all other words from ymm0 (zero).
        # ymm1 now holds the loaded value in word 0 and word 8.
        vpblendw        $1, %ymm1, %ymm0, %ymm1
        # Mask 0x0f: dwords 0-3 from ymm1, dwords 4-7 from ymm0 (zero).
        # This discards the stray copy in word 8; result is left in ymm0.
        vpblendd        $15, %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc

Reply via email to