Hello,
this patch expands __builtin_shuffle for V4DF mode in at most 3 insn. It
is simple and works really well, often generates only 2 insn. It is not
very generic, because other modes don't have an instruction equivalent to
vshufpd. For V8SF (and likely V4DI and V8SI with AVX2, but I still need to
do that), the default case of my patch in PR 52607 seems more interesting.
I tried calling this new function after expand_vec_perm_vperm2f128_vblend
(instead of before as in the patch), but it generated more instructions
for some permutations, and never less. That function is still useful for
V8SF though.
I bootstrapped gcc on a non-avx platform, compiled a program that tests
all 4096 shuffles with -mavx/-mavx2, and ran the result using Intel's
emulator (SDE).
There are still a few V4DF permutations that don't generate an optimal
sequence (3 insn instead of 2), but not that many I think. Of course, I am
assuming a constant cost of 1 per insn, which is completely false, but
seems like a sensible first approximation.
(note that I can't commit)
2012-04-17 Marc Glisse marc.gli...@inria.fr
PR target/52607
* config/i386/i386.c (ix86_expand_vec_perm_const): Move code to ...
(canonicalize_perm): ... new function.
(expand_vec_perm_2vperm2f128_vshuf): New function.
(ix86_expand_vec_perm_const_1): Call it.
--
Marc Glisse
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 186523)
+++ config/i386/i386.c (working copy)
@@ -32946,6 +32946,7 @@
bool testing_p;
};
+static bool canonicalize_perm (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
@@ -37003,6 +37004,57 @@
return true;
}
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
+   permutation as two vperm2f128 plus a blending vshufpd.  Return false
+   unless AVX V4DF; if D->testing_p, return true without emitting.  */
+
+static bool
+expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dfirst, dsecond, dthird;
+  bool ok;
+
+  if (!TARGET_AVX || (d->vmode != V4DFmode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  dfirst = *d;
+  dsecond = *d;
+  dthird = *d;
+  /* DFIRST/DSECOND fetch aligned element pairs; DTHIRD picks within them.  */
+  dfirst.perm[0] = (d->perm[0] & ~1);
+  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
+  dfirst.perm[2] = (d->perm[2] & ~1);
+  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
+  dsecond.perm[0] = (d->perm[1] & ~1);
+  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
+  dsecond.perm[2] = (d->perm[3] & ~1);
+  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
+  dthird.perm[0] = (d->perm[0] % 2);
+  dthird.perm[1] = (d->perm[1] % 2) + 4;
+  dthird.perm[2] = (d->perm[2] % 2) + 2;
+  dthird.perm[3] = (d->perm[3] % 2) + 6;
+  /* The final shuffle combines the two temporaries, hence two operands.  */
+  dfirst.target = gen_reg_rtx (dfirst.vmode);
+  dsecond.target = gen_reg_rtx (dsecond.vmode);
+  dthird.op0 = dfirst.target;
+  dthird.op1 = dsecond.target;
+  dthird.one_operand_p = false;
+
+  canonicalize_perm (&dfirst);
+  canonicalize_perm (&dsecond);
+
+  ok = expand_vec_perm_1 (&dfirst)
+       && expand_vec_perm_1 (&dsecond)
+       && expand_vec_perm_1 (&dthird);
+
+  gcc_assert (ok);
+
+  return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
@@ -37652,6 +37704,9 @@
/* Try sequences of three instructions. */
+ if (expand_vec_perm_2vperm2f128_vshuf (d))
+return true;
+
if (expand_vec_perm_pshufb2 (d))
return true;
@@ -37689,12 +37744,56 @@
return false;
}
+/* If a permutation only uses one operand, make it clear.  Returns true
+   if PERM indexes both operands, even when they are identical and folded.  */
+
+static bool
+canonicalize_perm (struct expand_vec_perm_d *d)
+{
+  int i, which, nelt = d->nelt;
+  /* WHICH bit 0: op0 elements are selected; bit 1: op1 elements.  */
+  for (i = which = 0; i < nelt; ++i)
+    which |= (d->perm[i] < nelt ? 1 : 2);
+
+  d->one_operand_p = true;
+  switch (which)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 3:
+      if (!rtx_equal_p (d->op0, d->op1))
+        {
+          d->one_operand_p = false;
+          break;
+        }
+      /* The elements of PERM do not suggest that only the first operand
+         is used, but both operands are identical.  Allow easier matching
+         of the permutation by folding the permutation into the single
+         input vector.  */
+      /* FALLTHRU */
+
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d->perm[i] &= nelt - 1;
+      d->op0 = d->op1;
+      break;
+
+    case 1:
+      d->op1 = d->op0;
+      break;
+    }
+
+  return (which == 3);
+}
+
bool
ix86_expand_vec_perm_const (rtx operands[4])
{
struct expand_vec_perm_d d;
unsigned char perm[MAX_VECT_LEN];
- int i, nelt, which;
+ int i, nelt;
+ bool two_args;
rtx sel;
d.target = operands[0];
@@ -37711,45 +37810,16 @@
gcc_assert (XVECLEN (sel, 0) == nelt);
gcc_checking_assert (sizeof