https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103571
--- Comment #18 from Hongtao.liu <crazylht at gmail dot com> ---
Codegen for foo1/foo2 is suboptimal under -mavx2. I guess we can add a vec_setv16hf_0 pattern that uses vpblendw.

typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32)));
typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));

__m256h
__attribute__ ((noinline, noclone))
foo1 (_Float16 x)
{
  return __extension__ (__m256h)(__v16hf) { x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

__m256h
__attribute__ ((noinline, noclone))
foo2 (_Float16 *x)
{
  return __extension__ (__m256h)(__v16hf) { *x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

foo1:
.LFB0:
        .cfi_startproc
        vpxor %xmm1, %xmm1, %xmm1
        vpbroadcastw %xmm0, %ymm0
        vpblendw $1, %ymm0, %ymm1, %ymm0
        vpblendd $15, %ymm0, %ymm1, %ymm1
        vmovdqa %ymm1, %ymm0
        ret
        .cfi_endproc
.LFE0:
        .size foo1, .-foo1
        .p2align 4
        .globl foo2
        .type foo2, @function
foo2:
.LFB1:
        .cfi_startproc
        vpbroadcastw (%rdi), %ymm1
        vpxor %xmm0, %xmm0, %xmm0
        vpblendw $1, %ymm1, %ymm0, %ymm1
        vpblendd $15, %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc