Hi!
Now that there is a better testsuite for constant reshuffling, this patch
fixes various issues I found plus improves various permutations.
Bootstrapped/regtested on x86_64-linux and i686-linux, additionally
tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc
RUNTESTFLAGS='--target_board=unix\{-msse2,-msse4,-mavx\} dg-torture.exp=vshuf*'
on AVX capable box and tested -mavx2 compiled tests on sde.
Ok for trunk?
Examples of improvements, say for V16HImode:
- vpshuflw $228, a(%rip), %ymm0
+ vmovdqa a(%rip), %ymm0
vmovdqa %ymm0, c(%rip)
(for identity permutation), ICE vs.
+ vpbroadcastw a(%rip), %ymm0
+ vmovdqa %ymm0, c(%rip)
using vpbroadcast* for broadcast shuffle,
- vpshufb .LC0(%rip), %ymm0, %ymm1
- vpshufb .LC1(%rip), %ymm0, %ymm0
- vpermq $78, %ymm1, %ymm1
- vpor %ymm1, %ymm0, %ymm0
+ vperm2i128 $0, %ymm0, %ymm0, %ymm0
+ vpshufb .LC0(%rip), %ymm0, %ymm0
when both lanes refer to just one lane, 20 insns (full two argument
non-constant shuffle) into:
+ vmovdqa a(%rip), %ymm0
+ vpunpcklwd b(%rip), %ymm0, %ymm0
+ vpshufb .LC2(%rip), %ymm0, %ymm0
+ vmovdqa %ymm0, c(%rip)
(resp. vpunpckhwd) when interleave gives something vpshufb can reshuffle
afterwards,
- vmovdqa a(%rip), %ymm0
- vpshufb .LC11(%rip), %ymm0, %ymm1
- vpshufb .LC12(%rip), %ymm0, %ymm0
- vpermq $78, %ymm1, %ymm1
- vpor %ymm1, %ymm0, %ymm0
+ vpermq $156, a(%rip), %ymm0
+ vpshufb .LC4(%rip), %ymm0, %ymm0
another case where vpermq can shuffle quadwords into something vpshufb can
reshuffle, etc.
2011-10-18 Jakub Jelinek ja...@redhat.com
* config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
mode SUBREG of operands[0] as target.
(valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
(expand_vec_perm_pshufb): For V8SImode vmode emit avx2_permvarv8si.
(expand_vec_perm_1): Handle identity and some broadcast
permutations.
(expand_vec_perm_interleave2): Handle also 32-byte modes, using
vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
For d->testing_p return true earlier to avoid creating more GC
garbage.
(expand_vec_perm_vpermq_perm_1): New function.
(expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
earlier to avoid creating more GC garbage. Fix handling of
V16HImode. Avoid some SUBREGs in SET_DEST.
(expand_vec_perm_broadcast_1): Return false for 32-byte integer
vector modes.
(expand_vec_perm_vpshufb4_vpermq2): New function.
(ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
and expand_vec_perm_vpshufb4_vpermq2.
--- gcc/config/i386/i386.c.jj 2011-10-17 22:27:39.0 +0200
+++ gcc/config/i386/i386.c 2011-10-18 14:08:58.0 +0200
@@ -19663,7 +19663,7 @@ ix86_expand_vec_perm (rtx operands[])
mask = expand_simple_binop (maskmode, AND, mask, vt,
NULL_RTX, 0, OPTAB_DIRECT);
- xops[0] = operands[0];
+ xops[0] = gen_lowpart (mode, operands[0]);
xops[1] = gen_lowpart (mode, t2);
xops[2] = gen_lowpart (mode, t1);
xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -35006,8 +35006,7 @@ valid_perm_using_mode_p (enum machine_mo
return false;
else
for (j = 1; j chunk; ++j)
- if ((d-perm[i] (d-nelt - 1)) + j
- != (d-perm[i + j] (d-nelt - 1)))
+ if (d-perm[i] + j != d-perm[i + j])
return false;
return true;
@@ -35138,6 +35137,8 @@ expand_vec_perm_pshufb (struct expand_ve
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else
+ emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
}
else
{
@@ -35163,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_per
if (d-op0 == d-op1)
{
int mask = nelt - 1;
+ bool identity_perm = true;
+ bool broadcast_perm = true;
for (i = 0; i nelt; i++)
- perm2[i] = d-perm[i] mask;
+ {
+ perm2[i] = d-perm[i] mask;
+ if (perm2[i] != i)
+ identity_perm = false;
+ if (perm2[i])
+ broadcast_perm = false;
+ }
+
+ if (identity_perm)
+ {
+ if (!d-testing_p)
+ emit_move_insn (d-target, d-op0);
+ return true;
+ }
+ else if (broadcast_perm TARGET_AVX2)
+ {
+ /* Use vpbroadcast{b,w,d}. */
+ rtx op = d-op0, (*gen) (rtx, rtx) = NULL;
+ switch (d-vmode)
+ {
+ case V32QImode:
+ op = gen_lowpart (V16QImode, op);
+ gen = gen_avx2_pbroadcastv32qi;
+ break;
+ case V16HImode:
+ op = gen_lowpart (V8HImode, op);
+ gen =