https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125880
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Blueprint for testcases not requiring vectorization:
/* 16-byte vector of short (GCC vector extension); foo below fills it
   with eight elements. */
typedef short v8hi __attribute__((vector_size(16)));
/* Build a v8hi directly from eight scalar loads, with no loop involved:
   lanes 0-3 are *p, *q, *r, *w and lanes 4-7 are the elements s shorts
   further along each of the same four pointers. */
v8hi foo (short *p, short *q, short *r, short *w, int s)
{
return (v8hi){*p, *q, *r, *w, *(p+s), *(q+s), *(r+s), *(w+s)};
}
Currently, compiling this with -march=znver5 gets you:
foo:
.LFB0:
.cfi_startproc
movzwl (%rdi), %eax
movslq %r8d, %r8
vmovd %eax, %xmm0
movzwl (%rdx), %eax
vpinsrw $1, (%rsi), %xmm0, %xmm0
vmovd %eax, %xmm3
movzwl (%rdi,%r8,2), %eax
vpinsrw $1, (%rcx), %xmm3, %xmm3
vmovd %eax, %xmm1
movzwl (%rdx,%r8,2), %eax
vpinsrw $1, (%rsi,%r8,2), %xmm1, %xmm1
vpunpckldq %xmm3, %xmm0, %xmm0
vmovd %eax, %xmm2
vpinsrw $1, (%rcx,%r8,2), %xmm2, %xmm2
vpunpckldq %xmm2, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm0, %xmm0