[PATCH] i?86 vec_perm fixes and improvements

2011-10-18 Thread Jakub Jelinek
Hi!

Now that there is a better testsuite for constant reshuffling, this patch
fixes various issues I found plus improves various permutations.
Bootstrapped/regtested on x86_64-linux and i686-linux, additionally
tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc 
RUNTESTFLAGS='--target_board=unix\{-msse2,-msse4,-mavx\} dg-torture.exp=vshuf*'
on AVX capable box and tested -mavx2 compiled tests on sde.
Ok for trunk?

Examples of improvements, say for V16HImode:
-   vpshuflw    $228, a(%rip), %ymm0
+   vmovdqa a(%rip), %ymm0
    vmovdqa %ymm0, c(%rip)
(for identity permutation), ICE vs.
+   vpbroadcastw    a(%rip), %ymm0
+   vmovdqa %ymm0, c(%rip)
using vpbroadcast* for broadcast shuffle,
-   vpshufb .LC0(%rip), %ymm0, %ymm1
-   vpshufb .LC1(%rip), %ymm0, %ymm0
-   vpermq  $78, %ymm1, %ymm1
-   vpor    %ymm1, %ymm0, %ymm0
+   vperm2i128  $0, %ymm0, %ymm0, %ymm0
+   vpshufb .LC0(%rip), %ymm0, %ymm0
when both lanes refer to just one lane,  20 insns (full two argument
non-constant shuffle) into:
+   vmovdqa a(%rip), %ymm0
+   vpunpcklwd  b(%rip), %ymm0, %ymm0
+   vpshufb .LC2(%rip), %ymm0, %ymm0
+   vmovdqa %ymm0, c(%rip)
(resp. vpunpckhwd) when interleave gives something vpshufb can reshuffle
afterwards,
-   vmovdqa a(%rip), %ymm0
-   vpshufb .LC11(%rip), %ymm0, %ymm1
-   vpshufb .LC12(%rip), %ymm0, %ymm0
-   vpermq  $78, %ymm1, %ymm1
-   vpor    %ymm1, %ymm0, %ymm0
+   vpermq  $156, a(%rip), %ymm0
+   vpshufb .LC4(%rip), %ymm0, %ymm0
another case where vpermq can shuffle quadwords into something vpshufb can
reshuffle, etc.

2011-10-18  Jakub Jelinek  ja...@redhat.com

* config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
mode SUBREG of operands[0] as target.
(valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
(expand_vec_perm_pshufb): For V8SImode vmode emit avx2_permvarv8si.
(expand_vec_perm_1): Handle identity and some broadcast
permutations.
(expand_vec_perm_interleave2): Handle also 32-byte modes, using
vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
For d->testing_p return true earlier to avoid creating more GC
garbage.
(expand_vec_perm_vpermq_perm_1): New function.
(expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
earlier to avoid creating more GC garbage.  Fix handling of
V16HImode.  Avoid some SUBREGs in SET_DEST.
(expand_vec_perm_broadcast_1): Return false for 32-byte integer
vector modes.
(expand_vec_perm_vpshufb4_vpermq2): New function.
(ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
and expand_vec_perm_vpshufb4_vpermq2.

--- gcc/config/i386/i386.c.jj   2011-10-17 22:27:39.0 +0200
+++ gcc/config/i386/i386.c  2011-10-18 14:08:58.0 +0200
@@ -19663,7 +19663,7 @@ ix86_expand_vec_perm (rtx operands[])
   mask = expand_simple_binop (maskmode, AND, mask, vt,
  NULL_RTX, 0, OPTAB_DIRECT);
 
-  xops[0] = operands[0];
+  xops[0] = gen_lowpart (mode, operands[0]);
   xops[1] = gen_lowpart (mode, t2);
   xops[2] = gen_lowpart (mode, t1);
   xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -35006,8 +35006,7 @@ valid_perm_using_mode_p (enum machine_mo
   return false;
 else
   for (j = 1; j < chunk; ++j)
-   if ((d->perm[i] & (d->nelt - 1)) + j
-   != (d->perm[i + j] & (d->nelt - 1)))
+   if (d->perm[i] + j != d->perm[i + j])
  return false;
 
   return true;
@@ -35138,6 +35137,8 @@ expand_vec_perm_pshufb (struct expand_ve
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
   else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+  else
+   emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
 }
   else
 {
@@ -35163,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_per
   if (d->op0 == d->op1)
 {
   int mask = nelt - 1;
+  bool identity_perm = true;
+  bool broadcast_perm = true;
 
   for (i = 0; i < nelt; i++)
-   perm2[i] = d->perm[i] & mask;
+   {
+ perm2[i] = d->perm[i] & mask;
+ if (perm2[i] != i)
+   identity_perm = false;
+ if (perm2[i])
+   broadcast_perm = false;
+   }
+
+  if (identity_perm)
+   {
+ if (!d->testing_p)
+   emit_move_insn (d->target, d->op0);
+ return true;
+   }
+  else if (broadcast_perm && TARGET_AVX2)
+   {
+ /* Use vpbroadcast{b,w,d}.  */
+ rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+ switch (d->vmode)
+   {
+   case V32QImode:
+ op = gen_lowpart (V16QImode, op);
+ gen = gen_avx2_pbroadcastv32qi;
+ break;
+   case V16HImode:
+ op = gen_lowpart (V8HImode, op);
+ gen = 

Re: [PATCH] i?86 vec_perm fixes and improvements

2011-10-18 Thread Richard Henderson
On 10/18/2011 08:30 AM, Jakub Jelinek wrote:
   * config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
   mode SUBREG of operands[0] as target.
   (valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
   (expand_vec_perm_pshufb): For V8SImode vmode emit avx2_permvarv8si.
   (expand_vec_perm_1): Handle identity and some broadcast
   permutations.
   (expand_vec_perm_interleave2): Handle also 32-byte modes, using
   vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
   For d->testing_p return true earlier to avoid creating more GC
   garbage.
   (expand_vec_perm_vpermq_perm_1): New function.
   (expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
   earlier to avoid creating more GC garbage.  Fix handling of
   V16HImode.  Avoid some SUBREGs in SET_DEST.
   (expand_vec_perm_broadcast_1): Return false for 32-byte integer
   vector modes.
   (expand_vec_perm_vpshufb4_vpermq2): New function.
   (ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
   and expand_vec_perm_vpshufb4_vpermq2.

Ok.


r~