https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125357
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Assignee|unassigned at gcc dot gnu.org |jakub at gcc dot gnu.org
--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
So I'd go with
--- gcc/config/i386/i386-expand.cc.jj 2026-05-18 09:47:52.032945399 +0200
+++ gcc/config/i386/i386-expand.cc 2026-05-18 11:29:06.397822007 +0200
@@ -5578,7 +5578,7 @@ ix86_expand_vec_perm (rtx operands[])
switch (mode)
{
case E_V16SImode:
- gen =gen_avx512f_permvarv16si;
+ gen = gen_avx512f_permvarv16si;
break;
case E_V16SFmode:
gen = gen_avx512f_permvarv16sf;
@@ -5702,6 +5702,8 @@ ix86_expand_vec_perm (rtx operands[])
return;
case E_V4SImode:
+ if (one_operand_shuffle)
+ break; /* Handled below for TARGET_AVX. */
/* By combining the two 128-bit input vectors into one 256-bit
input vector, we can use VPERMD and VPERMPS for the full
two-operand shuffle. */
@@ -5714,6 +5716,8 @@ ix86_expand_vec_perm (rtx operands[])
return;
case E_V4SFmode:
+ if (one_operand_shuffle)
+ break; /* Handled below for TARGET_AVX. */
t1 = gen_reg_rtx (V8SFmode);
t2 = gen_reg_rtx (V8SImode);
mask = gen_lowpart (V4SImode, mask);
@@ -5820,6 +5824,22 @@ ix86_expand_vec_perm (rtx operands[])
}
}
+ if (TARGET_AVX
+ && one_operand_shuffle
+ && (mode == V4SImode || mode == V4SFmode))
+ {
+ if (mode == V4SImode)
+ {
+ op0 = gen_lowpart (V4SFmode, op0);
+ t1 = gen_reg_rtx (V4SFmode);
+ emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
+ emit_move_insn (target, gen_lowpart (mode, t1));
+ }
+ else
+ emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask));
+ return;
+ }
+
if (TARGET_XOP)
{
/* The XOP VPPERM insn supports three inputs. By ignoring the
IMHO we don't need to handle V8SImode/V8SFmode similarly: for !TARGET_AVX2 the
mask mode is not valid, so we'll expand it as horrible code anyway, and for
TARGET_AVX2 we already emit reasonable code (vpermd and vpermps insns).