Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-21 Thread Uros Bizjak
On Thu, Nov 20, 2014 at 5:25 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote:
 Bootstrap / make check passed with updated patch.

 Is it still ok?

 It looks like we don't need expand_vec_perm_vpshufb2_vpermq_even_odd
 any more with the patch.
 However the cleanup will be in a separate patch after appropriate testing.

 Modified ChangeLog:

 2014-11-20  Evgeny Stupachenko  <evstu...@gmail.com>

 gcc/testsuite
 PR target/60451
 * gcc.target/i386/pr60451.c: New.

 gcc/
 PR target/60451
 * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
 (expand_vec_perm_even_odd_1): Add new expand for V8HI mode,
 replace for V16QI, V16HI and V32QI modes.
 (ix86_expand_vec_perm_const_1): Add new expand.

OK.

Thanks,
Uros.


Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Jakub Jelinek
On Thu, Nov 20, 2014 at 02:36:26PM +0300, Evgeny Stupachenko wrote:
 +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
 +     shuffles.  */

I think switch (d->vmode) would be more readable.

 +  op = gen_reg_rtx (d->vmode);
 +  t = gen_reg_rtx (V4DImode);
 +  emit_insn (gen_pack (op, dop0, dop1));
 +  emit_insn (gen_avx2_permv4di_1 (t, gen_lowpart (V4DImode, op),
 const0_rtx,

Too long line, wrap it?

Will leave the rest to Uros.

Jakub
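
(For context: vpackusdw packs within each 128-bit lane, so for the 256-bit
modes the packed qwords come out as { lo(op0), lo(op1), hi(op0), hi(op1) },
and the gen_avx2_permv4di_1 call quoted above -- a vpermq with selector
0,2,1,3 -- restores the operand order.  A minimal AVX2 intrinsics sketch of
the even V16HI case; the helper name is made up here, not from the patch.

#include <immintrin.h>

static __m256i
extract_even_epi16_256 (__m256i a, __m256i b)
{
  const __m256i mask = _mm256_set1_epi32 (0xffff);
  /* vpand, vpand, vpackusdw -- but the pack works per 128-bit lane.  */
  __m256i packed = _mm256_packus_epi32 (_mm256_and_si256 (a, mask),
                                        _mm256_and_si256 (b, mask));
  /* vpermq 0,2,1,3 (_MM_SHUFFLE (3, 1, 2, 0) == 0xd8) moves the qwords
     back into { even(a), even(b) } order.  */
  return _mm256_permute4x64_epi64 (packed, _MM_SHUFFLE (3, 1, 2, 0));
}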


Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Uros Bizjak
On Thu, Nov 20, 2014 at 12:36 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote:
 Hi,

 The patch expands even/odd permutation using:
 and, and, pack in the even case
 shift, shift, pack in the odd case

 instead of the current pshufb, pshufb, por sequence or a big set of unpack insns.
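
(For illustration only: the two three-insn sequences correspond roughly to
the following intrinsics -- a minimal sketch of just the V8HI case, assuming
SSE4.1 for packusdw; the helper names are made up here, not from the patch.

#include <smmintrin.h>  /* SSE4.1 */

/* Even elements: each 32-bit lane holds an (even, odd) pair of 16-bit
   elements; masking with 0xffff keeps the even one, and since the masked
   values are <= 0xffff the unsigned-saturating pack is exact.
   pand, pand, packusdw.  */
static __m128i
extract_even_epi16 (__m128i a, __m128i b)
{
  const __m128i mask = _mm_set1_epi32 (0xffff);
  return _mm_packus_epi32 (_mm_and_si128 (a, mask),
                           _mm_and_si128 (b, mask));
}

/* Odd elements: a logical right shift by 16 moves each odd element into
   the low half of its 32-bit lane before the same pack.
   psrld, psrld, packusdw.  */
static __m128i
extract_odd_epi16 (__m128i a, __m128i b)
{
  return _mm_packus_epi32 (_mm_srli_epi32 (a, 16),
                           _mm_srli_epi32 (b, 16));
}

extract_even_epi16 yields { a0, a2, a4, a6, b0, b2, b4, b6 }, which is
exactly the extract-even permutation of the two operands.)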

 AVX2/CORE bootstrap and make check passed.
 expensive tests are in progress

 Is it ok for trunk?

 Evgeny

 2014-11-20  Evgeny Stupachenko  <evstu...@gmail.com>

 gcc/testsuite
 PR target/60451
 * gcc.target/i386/pr60451.c: New.

 gcc/
 PR target/60451
 * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
 (expand_vec_perm_even_odd_1): Add new expand for SSE cases,
 replace with it for AVX2 cases.
 (ix86_expand_vec_perm_const_1): Add new expand.

OK with a couple of small adjustments below.

Thanks,
Uros.

 +/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
 +   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
 +   with two "and" and "pack" or two "shift" and "pack" insns.  We should
 +   have already failed all two instruction sequences.  */
 +
 +static bool
 +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 +{
 +  rtx op, dop0, dop1, t, rperm[16];
 +  unsigned i, odd, c, s, nelt = d->nelt;
 +  bool end_perm = false;
 +  machine_mode half_mode;
 +  rtx (*gen_and) (rtx, rtx, rtx);
 +  rtx (*gen_pack) (rtx, rtx, rtx);
 +  rtx (*gen_shift) (rtx, rtx, rtx);
 +
 +  /* Required for pack.  */
 +  if (!TARGET_SSE4_2 || d->one_operand_p)
 +return false;
 +
 +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
 +     shuffles.  */
 +  if (d->vmode == V8HImode)

Use switch, as proposed by Jakub.

 +{
 +  c = 0xffff;
 +  s = 16;
 +  half_mode = V4SImode;
 +  gen_and = gen_andv4si3;
 +  gen_pack = gen_sse4_1_packusdw;
 +  gen_shift = gen_lshrv4si3;
 +}
 +  else if (d->vmode == V16QImode)
 +{
 +  c = 0xff;
 +  s = 8;
 +  half_mode = V8HImode;
 +  gen_and = gen_andv8hi3;
 +  gen_pack = gen_sse2_packuswb;
 +  gen_shift = gen_lshrv8hi3;
 +}
 +  else if (d->vmode == V16HImode)
 +{
 +  c = 0xffff;
 +  s = 16;
 +  half_mode = V8SImode;
 +  gen_and = gen_andv8si3;
 +  gen_pack = gen_avx2_packusdw;
 +  gen_shift = gen_lshrv8si3;
 +  end_perm = true;
 +}
 +  else if (d->vmode == V32QImode)
 +{
 +  c = 0xff;
 +  s = 8;
 +  half_mode = V16HImode;
 +  gen_and = gen_andv16hi3;
 +  gen_pack = gen_avx2_packuswb;
 +  gen_shift = gen_lshrv16hi3;
 +  end_perm = true;
 +}
 +  else
 +return false;
 +
 +  /* Check that permutation is even or odd.  */
 +  odd = d->perm[0];
 +  if (odd != 0 && odd != 1)

if (odd > 1)

 +return false;
 +
 +  for (i = 1; i < nelt; ++i)
 +if (d->perm[i] != 2 * i + odd)
 +  return false;
 +
 +  if (d->testing_p)
 +return true;
 +
 +  dop0 = gen_reg_rtx (half_mode);
 +  dop1 = gen_reg_rtx (half_mode);
 +  if (odd == 0)
 +{
 +  for (i = 0; i < nelt / 2; rperm[i++] = GEN_INT (c));

Please write above as:

 for (i = 0; i < nelt / 2; i++)
   rperm[i] = GEN_INT (c);

 +  t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
 +  t = force_reg (half_mode, t);
 +  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
 +  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
 +}
 +  else
 +{
 +  emit_insn (gen_shift (dop0,
 +   gen_lowpart (half_mode, d->op0),
 +   GEN_INT (s)));
 +  emit_insn (gen_shift (dop1,
 +   gen_lowpart (half_mode, d->op1),
 +   GEN_INT (s)));
 +}
 +  /* In AVX2 for 256 bit case we need to permute pack result.  */
 +  if (TARGET_AVX2 && end_perm)
 +{
 +  op = gen_reg_rtx (d->vmode);
 +  t = gen_reg_rtx (V4DImode);
 +  emit_insn (gen_pack (op, dop0, dop1));
 +  emit_insn (gen_avx2_permv4di_1 (t, gen_lowpart (V4DImode, op),
 const0_rtx,
 + const2_rtx, const1_rtx, GEN_INT (3)));
 +  emit_move_insn (d->target, gen_lowpart (d->vmode, t));
 +}
 +  else
 +emit_insn (gen_pack (d->target, dop0, dop1));
 +
 +  return true;
 +}
 +
  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
 and extract-odd permutations.  */

 @@ -48393,6 +48503,8 @@ expand_vec_perm_even_odd_1 (struct
 expand_vec_perm_d *d, unsigned odd)
gcc_unreachable ();

  case V8HImode:
 +  if (TARGET_SSE4_2)
 +   return expand_vec_perm_even_odd_pack (d);
if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)

else if in the above line, to be consistent with else below.

 return expand_vec_perm_pshufb2 (d);
else
 @@ -48416,6 +48528,8 @@ expand_vec_perm_even_odd_1 (struct
 expand_vec_perm_d *d, unsigned odd)
break;

  case V16QImode:
 +  if (TARGET_SSE4_2)
 +   return expand_vec_perm_even_odd_pack (d);
   

Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Evgeny Stupachenko
Thank you.
Patch with proposed fixes:

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 085eb54..09c0057 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48322,6 +48322,120 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
(struct expand_vec_perm_d *d)
   return true;
 }

+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+   with two "and" and "pack" or two "shift" and "pack" insns.  We should
+   have already failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+  rtx op, dop0, dop1, t, rperm[16];
+  unsigned i, odd, c, s, nelt = d->nelt;
+  bool end_perm = false;
+  machine_mode half_mode;
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_pack) (rtx, rtx, rtx);
+  rtx (*gen_shift) (rtx, rtx, rtx);
+
+  /* Required for pack.  */
+  if (!TARGET_SSE4_2 || d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case V8HImode:
+  c = 0xffff;
+  s = 16;
+  half_mode = V4SImode;
+  gen_and = gen_andv4si3;
+  gen_pack = gen_sse4_1_packusdw;
+  gen_shift = gen_lshrv4si3;
+  break;
+case V16QImode:
+  c = 0xff;
+  s = 8;
+  half_mode = V8HImode;
+  gen_and = gen_andv8hi3;
+  gen_pack = gen_sse2_packuswb;
+  gen_shift = gen_lshrv8hi3;
+  break;
+case V16HImode:
+  c = 0xffff;
+  s = 16;
+  half_mode = V8SImode;
+  gen_and = gen_andv8si3;
+  gen_pack = gen_avx2_packusdw;
+  gen_shift = gen_lshrv8si3;
+  end_perm = true;
+  break;
+case V32QImode:
+  c = 0xff;
+  s = 8;
+  half_mode = V16HImode;
+  gen_and = gen_andv16hi3;
+  gen_pack = gen_avx2_packuswb;
+  gen_shift = gen_lshrv16hi3;
+  end_perm = true;
+  break;
+default:
+  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+     general shuffles.  */
+  return false;
+}
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+return false;
+
+  for (i = 1; i < nelt; ++i)
+if (d->perm[i] != 2 * i + odd)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  dop0 = gen_reg_rtx (half_mode);
+  dop1 = gen_reg_rtx (half_mode);
+  if (odd == 0)
+{
+  for (i = 0; i < nelt / 2; i++)
+   rperm[i] = GEN_INT (c);
+  t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+  t = force_reg (half_mode, t);
+  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+}
+  else
+{
+  emit_insn (gen_shift (dop0,
+   gen_lowpart (half_mode, d->op0),
+   GEN_INT (s)));
+  emit_insn (gen_shift (dop1,
+   gen_lowpart (half_mode, d->op1),
+   GEN_INT (s)));
+}
+  /* In AVX2 for 256 bit case we need to permute pack result.  */
+  if (TARGET_AVX2 && end_perm)
+{
+  op = gen_reg_rtx (d->vmode);
+  t = gen_reg_rtx (V4DImode);
+  emit_insn (gen_pack (op, dop0, dop1));
+  emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+}
+  else
+emit_insn (gen_pack (d->target, dop0, dop1));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
and extract-odd permutations.  */

@@ -48393,7 +48507,9 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
   gcc_unreachable ();

 case V8HImode:
-  if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+  if (TARGET_SSE4_2)
+   return expand_vec_perm_even_odd_pack (d);
+  else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
   else
{
@@ -48416,7 +48532,9 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
   break;

 case V16QImode:
-  if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+  if (TARGET_SSE4_2)
+   return expand_vec_perm_even_odd_pack (d);
+  else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
   else
{
@@ -48441,7 +48559,7 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)

 case V16HImode:
 case V32QImode:
-  return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+  return expand_vec_perm_even_odd_pack (d);

 case V4DImode:
   if (!TARGET_AVX2)
@@ -48814,6 +48932,9 @@ ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)

   /* Try sequences of three instructions.  */

+  if (expand_vec_perm_even_odd_pack (d))
+return true;

Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Richard Henderson
On 11/20/2014 12:36 PM, Evgeny Stupachenko wrote:
 +  /* Required for pack.  */
 +  if (!TARGET_SSE4_2 || d->one_operand_p)
 +return false;

Why the SSE4_2 check here when...

 +
 +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
 +     shuffles.  */
 +  if (d->vmode == V8HImode)
 +{
 +  c = 0xffff;
 +  s = 16;
 +  half_mode = V4SImode;
 +  gen_and = gen_andv4si3;
 +  gen_pack = gen_sse4_1_packusdw;

... it's SSE4_1 here,

 +  gen_shift = gen_lshrv4si3;
 +}
 +  else if (d->vmode == V16QImode)
 +{
 +  c = 0xff;
 +  s = 8;
 +  half_mode = V8HImode;
 +  gen_and = gen_andv8hi3;
 +  gen_pack = gen_sse2_packuswb;

... and SSE2 here?



r~


Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Evgeny Stupachenko
Good point! gen_shift also requires only SSE2.
That way we can optimize out the interleave sequence for V16QI mode in
expand_vec_perm_even_odd_1.
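
(For reference, a minimal SSE2-only sketch of the V16QI case -- pand, psrlw
and packuswb all date back to SSE2; the helper names are made up here, not
part of the patch.

#include <emmintrin.h>  /* SSE2 */

/* Even bytes: mask each 16-bit lane down to its low byte, then pack with
   unsigned saturation, which is exact for values <= 0xff.  */
static __m128i
extract_even_epi8 (__m128i a, __m128i b)
{
  const __m128i mask = _mm_set1_epi16 (0xff);
  return _mm_packus_epi16 (_mm_and_si128 (a, mask),
                           _mm_and_si128 (b, mask));
}

/* Odd bytes: shift each 16-bit lane right by 8, then pack.  */
static __m128i
extract_odd_epi8 (__m128i a, __m128i b)
{
  return _mm_packus_epi16 (_mm_srli_epi16 (a, 8),
                           _mm_srli_epi16 (b, 8));
}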
Thanks!

Evgeny

Updated patch:

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 085eb54..054089b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48322,6 +48322,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
(struct expand_vec_perm_d *d)
   return true;
 }

+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+   with two "and" and "pack" or two "shift" and "pack" insns.  We should
+   have already failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+  rtx op, dop0, dop1, t, rperm[16];
+  unsigned i, odd, c, s, nelt = d->nelt;
+  bool end_perm = false;
+  machine_mode half_mode;
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_pack) (rtx, rtx, rtx);
+  rtx (*gen_shift) (rtx, rtx, rtx);
+
+  if (d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case V8HImode:
+  /* Required for pack.  */
+  if (!TARGET_SSE4_1)
+return false;
+  c = 0xffff;
+  s = 16;
+  half_mode = V4SImode;
+  gen_and = gen_andv4si3;
+  gen_pack = gen_sse4_1_packusdw;
+  gen_shift = gen_lshrv4si3;
+  break;
+case V16QImode:
+  /* No check as all instructions are SSE2.  */
+  c = 0xff;
+  s = 8;
+  half_mode = V8HImode;
+  gen_and = gen_andv8hi3;
+  gen_pack = gen_sse2_packuswb;
+  gen_shift = gen_lshrv8hi3;
+  break;
+case V16HImode:
+  if (!TARGET_AVX2)
+return false;
+  c = 0x;
+  s = 16;
+  half_mode = V8SImode;
+  gen_and = gen_andv8si3;
+  gen_pack = gen_avx2_packusdw;
+  gen_shift = gen_lshrv8si3;
+  end_perm = true;
+  break;
+case V32QImode:
+  if (!TARGET_AVX2)
+return false;
+  c = 0xff;
+  s = 8;
+  half_mode = V16HImode;
+  gen_and = gen_andv16hi3;
+  gen_pack = gen_avx2_packuswb;
+  gen_shift = gen_lshrv16hi3;
+  end_perm = true;
+  break;
+default:
+  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+     general shuffles.  */
+  return false;
+}
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+return false;
+
+  for (i = 1; i < nelt; ++i)
+if (d->perm[i] != 2 * i + odd)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  dop0 = gen_reg_rtx (half_mode);
+  dop1 = gen_reg_rtx (half_mode);
+  if (odd == 0)
+{
+  for (i = 0; i < nelt / 2; i++)
+   rperm[i] = GEN_INT (c);
+  t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+  t = force_reg (half_mode, t);
+  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+}
+  else
+{
+  emit_insn (gen_shift (dop0,
+   gen_lowpart (half_mode, d->op0),
+   GEN_INT (s)));
+  emit_insn (gen_shift (dop1,
+   gen_lowpart (half_mode, d->op1),
+   GEN_INT (s)));
+}
+  /* In AVX2 for 256 bit case we need to permute pack result.  */
+  if (TARGET_AVX2 && end_perm)
+{
+  op = gen_reg_rtx (d->vmode);
+  t = gen_reg_rtx (V4DImode);
+  emit_insn (gen_pack (op, dop0, dop1));
+  emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+}
+  else
+emit_insn (gen_pack (d->target, dop0, dop1));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
and extract-odd permutations.  */

@@ -48393,7 +48514,9 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
   gcc_unreachable ();

 case V8HImode:
-  if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+  if (TARGET_SSE4_1)
+   return expand_vec_perm_even_odd_pack (d);
+  else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
   else
{
@@ -48416,32 +48539,11 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
   break;

 case V16QImode:
-  if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
-   return expand_vec_perm_pshufb2 (d);
-  else
-   {
- if (d->testing_p)
-   break;
- t1 = gen_reg_rtx (V16QImode);
- t2 = gen_reg_rtx (V16QImode);
- t3 = gen_reg_rtx (V16QImode);
- emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
- emit_insn (gen_vec_interleave_lowv16qi 

Re: [PATCH x86, PR60451] Expand even/odd permutation using pack insn.

2014-11-20 Thread Evgeny Stupachenko
Bootstrap / make check passed with updated patch.

Is it still ok?

It looks like we don't need expand_vec_perm_vpshufb2_vpermq_even_odd
any more with the patch.
However the cleanup will be in a separate patch after appropriate testing.

Modified ChangeLog:

2014-11-20  Evgeny Stupachenko  <evstu...@gmail.com>

gcc/testsuite
PR target/60451
* gcc.target/i386/pr60451.c: New.

gcc/
PR target/60451
* config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
(expand_vec_perm_even_odd_1): Add new expand for V8HI mode,
replace for V16QI, V16HI and V32QI modes.
(ix86_expand_vec_perm_const_1): Add new expand.

On Thu, Nov 20, 2014 at 6:03 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote:
 Good point! gen_shift also requires only SSE2.
 That way we can optimize out the interleave sequence for V16QI mode in
 expand_vec_perm_even_odd_1.
 Thanks!

 Evgeny

 Updated patch:

 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
 index 085eb54..054089b 100644
 --- a/gcc/config/i386/i386.c
 +++ b/gcc/config/i386/i386.c
 @@ -48322,6 +48322,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
 (struct expand_vec_perm_d *d)
return true;
  }

 +/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
 +   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
 +   with two "and" and "pack" or two "shift" and "pack" insns.  We should
 +   have already failed all two instruction sequences.  */
 +
 +static bool
 +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 +{
 +  rtx op, dop0, dop1, t, rperm[16];
 +  unsigned i, odd, c, s, nelt = d->nelt;
 +  bool end_perm = false;
 +  machine_mode half_mode;
 +  rtx (*gen_and) (rtx, rtx, rtx);
 +  rtx (*gen_pack) (rtx, rtx, rtx);
 +  rtx (*gen_shift) (rtx, rtx, rtx);
 +
 +  if (d->one_operand_p)
 +return false;
 +
 +  switch (d->vmode)
 +{
 +case V8HImode:
 +  /* Required for pack.  */
 +  if (!TARGET_SSE4_1)
 +return false;
 +  c = 0xffff;
 +  s = 16;
 +  half_mode = V4SImode;
 +  gen_and = gen_andv4si3;
 +  gen_pack = gen_sse4_1_packusdw;
 +  gen_shift = gen_lshrv4si3;
 +  break;
 +case V16QImode:
 +  /* No check as all instructions are SSE2.  */
 +  c = 0xff;
 +  s = 8;
 +  half_mode = V8HImode;
 +  gen_and = gen_andv8hi3;
 +  gen_pack = gen_sse2_packuswb;
 +  gen_shift = gen_lshrv8hi3;
 +  break;
 +case V16HImode:
 +  if (!TARGET_AVX2)
 +return false;
 +  c = 0xffff;
 +  s = 16;
 +  half_mode = V8SImode;
 +  gen_and = gen_andv8si3;
 +  gen_pack = gen_avx2_packusdw;
 +  gen_shift = gen_lshrv8si3;
 +  end_perm = true;
 +  break;
 +case V32QImode:
 +  if (!TARGET_AVX2)
 +return false;
 +  c = 0xff;
 +  s = 8;
 +  half_mode = V16HImode;
 +  gen_and = gen_andv16hi3;
 +  gen_pack = gen_avx2_packuswb;
 +  gen_shift = gen_lshrv16hi3;
 +  end_perm = true;
 +  break;
 +default:
 +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
 +     general shuffles.  */
 +  return false;
 +}
 +
 +  /* Check that permutation is even or odd.  */
 +  odd = d->perm[0];
 +  if (odd > 1)
 +return false;
 +
 +  for (i = 1; i < nelt; ++i)
 +if (d->perm[i] != 2 * i + odd)
 +  return false;
 +
 +  if (d->testing_p)
 +return true;
 +
 +  dop0 = gen_reg_rtx (half_mode);
 +  dop1 = gen_reg_rtx (half_mode);
 +  if (odd == 0)
 +{
 +  for (i = 0; i < nelt / 2; i++)
 +   rperm[i] = GEN_INT (c);
 +  t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
 +  t = force_reg (half_mode, t);
 +  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
 +  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
 +}
 +  else
 +{
 +  emit_insn (gen_shift (dop0,
 +   gen_lowpart (half_mode, d->op0),
 +   GEN_INT (s)));
 +  emit_insn (gen_shift (dop1,
 +   gen_lowpart (half_mode, d->op1),
 +   GEN_INT (s)));
 +}
 +  /* In AVX2 for 256 bit case we need to permute pack result.  */
 +  if (TARGET_AVX2 && end_perm)
 +{
 +  op = gen_reg_rtx (d->vmode);
 +  t = gen_reg_rtx (V4DImode);
 +  emit_insn (gen_pack (op, dop0, dop1));
 +  emit_insn (gen_avx2_permv4di_1 (t,
 + gen_lowpart (V4DImode, op),
 + const0_rtx,
 + const2_rtx,
 + const1_rtx,
 + GEN_INT (3)));
 +  emit_move_insn (d->target, gen_lowpart (d->vmode, t));
 +}
 +  else
 +emit_insn (gen_pack (d->target, dop0, dop1));
 +
 +  return true;
 +}
 +
  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
 and extract-odd permutations.  */

 @@ -48393,7 +48514,9 @@