https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103753

            Bug ID: 103753
           Summary: Unoptimal avx2 V16HF vector insert to element 0
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ubizjak at gmail dot com
  Target Milestone: ---

(Cloned from PR103571#18)

The following testcase:

--cut here--
typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32)));

__v16hf foo (_Float16 x)
{
  return (__v16hf) { x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}
--cut here--

compiles with -O2 -mavx2 to:

        vpxor   %xmm1, %xmm1, %xmm1
        vpbroadcastw    %xmm0, %ymm0
        vpblendw        $1, %ymm0, %ymm1, %ymm0
        vpblendd        $15, %ymm0, %ymm1, %ymm1
        vmovdqa %ymm1, %ymm0
        ret

while a similar version with a 16-bit integer:

--cut here--
typedef short __v16hi __attribute__ ((__vector_size__ (32)));

__v16hi bar (short x)
{
  return (__v16hi) { x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
}
--cut here--

compiles to a much shorter sequence:

        vpxor   %xmm1, %xmm1, %xmm1
        vpinsrw $0, %edi, %xmm1, %xmm0
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        ret

Please also note that with -O2 -mavx, the _Float16 version compiles to the optimal sequence:

        vpxor   %xmm1, %xmm1, %xmm1
        vpblendw        $1, %xmm0, %xmm1, %xmm0
        vinsertf128     $0x1, %xmm1, %ymm0, %ymm0
        ret

Reply via email to