Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-20 Thread Uros Bizjak
On Mon, Oct 20, 2014 at 5:19 PM, Ilya Tocar  wrote:
>> >
>> > The patch is OK with the above improvement.
>> >
>> >
>>
>> Will commit version below, if no objections in 24 hours.
>>
>>
> Sorry,
> I've missed palignr, which should also have v64qi version,
> and lost return in expand_vec_perm_palignr case
> (this caused avx512f-vec-unpack test failures).
> Patch below fixes it. Ok for trunk?
>
> 2014-10-20  Ilya Tocar  
>
> * config/i386/i386.c (expand_vec_perm_1): Fix
> expand_vec_perm_palignr case.
> * config/i386/sse.md (_palignr_mask): Use
> VI1_AVX512.

OK.

Thanks,
Uros.


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-20 Thread Ilya Tocar
> > 
> > The patch is OK with the above improvement.
> > 
> >
> 
> Will commit version below, if no objections in 24 hours.
> 
>
Sorry,
I've missed palignr, which should also have v64qi version,
and lost return in expand_vec_perm_palignr case
(this caused avx512f-vec-unpack test failures).
Patch below fixes it. Ok for trunk?

2014-10-20  Ilya Tocar  

* config/i386/i386.c (expand_vec_perm_1): Fix
expand_vec_perm_palignr case.
* config/i386/sse.md (_palignr_mask): Use
VI1_AVX512.

---
 gcc/config/i386/i386.c |  1 +
 gcc/config/i386/sse.md | 12 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 33b21f4..34273ca 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43552,6 +43552,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 
   /* Try the AVX2 vpalignr instruction.  */
   if (expand_vec_perm_palignr (d, true))
+return true;
 
   /* Try the AVX512F vpermi2 instructions.  */
   if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8157045..a3f336f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13716,14 +13716,14 @@
(set_attr "mode" "DI")])
 
 (define_insn "_palignr_mask"
-  [(set (match_operand:VI1_AVX2 0 "register_operand" "=v")
-(vec_merge:VI1_AVX2
- (unspec:VI1_AVX2
-   [(match_operand:VI1_AVX2 1 "register_operand" "v")
-(match_operand:VI1_AVX2 2 "nonimmediate_operand" "vm")
+  [(set (match_operand:VI1_AVX512 0 "register_operand" "=v")
+(vec_merge:VI1_AVX512
+ (unspec:VI1_AVX512
+   [(match_operand:VI1_AVX512 1 "register_operand" "v")
+(match_operand:VI1_AVX512 2 "nonimmediate_operand" "vm")
 (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")]
UNSPEC_PALIGNR)
-   (match_operand:VI1_AVX2 4 "vector_move_operand" "0C")
+   (match_operand:VI1_AVX512 4 "vector_move_operand" "0C")
(match_operand: 5 "register_operand" "Yk")))]
   "TARGET_AVX512BW && ( == 64 || TARGET_AVX512VL)"
 {
-- 
1.8.3.1



Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-16 Thread Jakub Jelinek
On Thu, Oct 16, 2014 at 02:23:16PM +0400, Ilya Tocar wrote:
> On 10 Oct 18:37, Uros Bizjak wrote:
> > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar  wrote:
> > 
> > 
> > Please recode that horrible first switch statement to:
> > 
> > --cut here--
> >   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
> > 
> >   switch (mode)
> > {
> > case V8HImode:
> >   if (TARGET_AVX512VL && TARGET_AVX152BW)
> > gen = gen_avx512vl_vpermi2varv8hi3;
> >   break;
> > 
> > ...
> > 
> > case V2DFmode:
> >   if (TARGET_AVX512VL)
> > {
> >   gen = gen_avx512vl_vpermi2varv2df3;
> >   maskmode = V2DImode;
> > 
> > The patch is OK with the above improvement.
> > 
> > Thanks,
> > Uros.
> >
> 
> Will commit version below, if no objections in 24 hours.

No need to wait, it is ok now (with proper ChangeLog of course).

Jakub


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-16 Thread Ilya Tocar
On 10 Oct 18:37, Uros Bizjak wrote:
> On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar  wrote:
> 
> 
> Please recode that horrible first switch statement to:
> 
> --cut here--
>   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
> 
>   switch (mode)
> {
> case V8HImode:
>   if (TARGET_AVX512VL && TARGET_AVX152BW)
> gen = gen_avx512vl_vpermi2varv8hi3;
>   break;
> 
> ...
> 
> case V2DFmode:
>   if (TARGET_AVX512VL)
> {
>   gen = gen_avx512vl_vpermi2varv2df3;
>   maskmode = V2DImode;
> 
> The patch is OK with the above improvement.
> 
> Thanks,
> Uros.
>

Will commit version below, if no objections in 24 hours.

---
 gcc/config/i386/i386.c | 292 ++---
 gcc/config/i386/sse.md |  45 
 2 files changed, 255 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index aedac19..e1228e3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21411,35 +21411,132 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+/* AVX512F does support 64-byte integer vector operations,
+   thus the longest vector we are faced with is V64QImode.  */
+#define MAX_VECT_LEN   64
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_operand_p;
+  bool testing_p;
+};
+
 static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+  enum machine_mode maskmode = mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+
   switch (mode)
 {
+case V8HImode:
+  if (TARGET_AVX512VL && TARGET_AVX512BW)
+   gen = gen_avx512vl_vpermi2varv8hi3;
+  break;
+case V16HImode:
+  if (TARGET_AVX512VL && TARGET_AVX512BW)
+   gen = gen_avx512vl_vpermi2varv16hi3;
+  break;
+case V32HImode:
+  if (TARGET_AVX512BW)
+   gen = gen_avx512bw_vpermi2varv32hi3;
+  break;
+case V4SImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv4si3;
+  break;
+case V8SImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv8si3;
+  break;
 case V16SImode:
-  emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
-  force_reg (V16SImode, mask),
-  op1));
-  return true;
+  if (TARGET_AVX512F)
+   gen = gen_avx512f_vpermi2varv16si3;
+  break;
+case V4SFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv4sf3;
+ maskmode = V4SImode;
+   }
+  break;
+case V8SFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv8sf3;
+ maskmode = V8SImode;
+   }
+  break;
 case V16SFmode:
-  emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
-  force_reg (V16SImode, mask),
-  op1));
-  return true;
+  if (TARGET_AVX512F)
+   {
+ gen = gen_avx512f_vpermi2varv16sf3;
+ maskmode = V16SImode;
+   }
+  break;
+case V2DImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv2di3;
+  break;
+case V4DImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv4di3;
+  break;
 case V8DImode:
-  emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
-  return true;
+  if (TARGET_AVX512F)
+   gen = gen_avx512f_vpermi2varv8di3;
+  break;
+case V2DFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv2df3;
+ maskmode = V2DImode;
+   }
+  break;
+case V4DFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv4df3;
+ maskmode = V4DImode;
+   }
+  break;
 case V8DFmode:
-  emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
-  return true;
+  if (TARGET_AVX512F)
+   {
+ gen = gen_avx512f_vpermi2varv8df3;
+ maskmode = V8DImode;
+   }
+  break;
 default:
-  return false;
+  break;
 }
+
+  if (gen == NULL)
+return false;
+
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  if (d

Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Uros Bizjak
On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar  wrote:

>> My strong preference would be:
>>   enum machine_mode maskmode = mode;
>>   rtx (*gen) (rtx, rtx, rtx, rtx);
>> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
>> line and then inside of the first switch just do:
>> ...
>> case V16SImode:
>>   if (!TARGET_AVX512F)
>>   return false;
>>   gen = gen_avx512f_vpermi2varv16si3;
>>   break;
>> case V4SFmode:
>>   if (!TARGET_AVX512VL)
>>   return false;
>>   gen = gen_avx512vl_vpermi2varv4sf3;
>>   maskmode = V4SImode;
>>   break;
>> ...
>> etc., then in the mask = line use:
>> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
>> and finally instead of the second switch do:
>>   emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
>>   return true;
>>
> Updated patch below.

Please recode that horrible first switch statement to:

--cut here--
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
{
case V8HImode:
  if (TARGET_AVX512VL && TARGET_AVX152BW)
gen = gen_avx512vl_vpermi2varv8hi3;
  break;

...

case V2DFmode:
  if (TARGET_AVX512VL)
{
  gen = gen_avx512vl_vpermi2varv2df3;
  maskmode = V2DImode;
}
  break;

default:
  break;
}

  if (gen == NULL)
return false;
--cut here--

The patch is OK with the above improvement.

(Please also note that the patch has a bunch of i386.md changes that
will clash with followup patch series).

Thanks,
Uros.


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Jakub Jelinek
On Fri, Oct 10, 2014 at 07:47:19PM +0400, Ilya Tocar wrote:
> Updated patch below.

You haven't posted ChangeLog entry this time, so using the last one:

* config/i386/i386.c
   
(MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2.
   
...
* config/i386/sse.md
   
(define_mode_iterator VI1_AVX512): New. 
   
I'd think you should avoid the line break after filename in these
cases, so
* config/i386/i386.c (MAX_VECT_LEN): Move above
ix86_expand_vec_perm_vpermi2.
...
* config/i386/sse.md (define_mode_iterator VI1_AVX512): New.

Other than that nit it looks good to me.

Jakub


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Ilya Tocar
On 09 Oct 20:51, Jakub Jelinek wrote:
> On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
> >return true;
> >  }
> >  
> > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, 
> > struct expand_vec_perm_d *d)
> 
> Too long line, please wrap it.
>
Fixed.

> >  {
> > -  enum machine_mode mode = GET_MODE (op0);
> > +  enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> > +
> >switch (mode)
> >  {
> > +case V8HImode:
> > +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> > +   return false;
> 
> My strong preference would be:
>   enum machine_mode maskmode = mode;
>   rtx (*gen) (rtx, rtx, rtx, rtx);
> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> line and then inside of the first switch just do:
> ...
> case V16SImode:
>   if (!TARGET_AVX512F)
>   return false;
>   gen = gen_avx512f_vpermi2varv16si3;
>   break;
> case V4SFmode:
>   if (!TARGET_AVX512VL)
>   return false;
>   gen = gen_avx512vl_vpermi2varv4sf3;
>   maskmode = V4SImode;
>   break;
> ...
> etc., then in the mask = line use:
> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
> and finally instead of the second switch do:
>   emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
>   return true;
>
Updated patch below.

---
 gcc/config/i386/i386.c | 281 +++--
 gcc/config/i386/sse.md |  45 
 2 files changed, 253 insertions(+), 73 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..2247da8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21358,33 +21358,132 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+/* AVX512F does support 64-byte integer vector operations,
+   thus the longest vector we are faced with is V64QImode.  */
+#define MAX_VECT_LEN   64
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_operand_p;
+  bool testing_p;
+};
+
 static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+  enum machine_mode maskmode = mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+
   switch (mode)
 {
+case V8HImode:
+  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8hi3; 
+  break;
+case V16HImode:
+  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512vl_vpermi2varv16hi3;
+  break;
+case V32HImode:
+  if (!TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512bw_vpermi2varv32hi3; 
+  break;
+case V4SImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4si3;
+  break;
+case V8SImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8si3;
+  break;
 case V16SImode:
-  emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
-  return true;
+  if (!TARGET_AVX512F)
+   return false;
+  gen = gen_avx512f_vpermi2varv16si3;
+  break;
+case V4SFmode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4sf3;
+  maskmode = V4SImode;
+  break;
+case V8SFmode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8sf3;
+  maskmode = V8SImode;
+  break;
 case V16SFmode:
-  emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
-  return true;
+  if (!TARGET_AVX512F)
+   return false;
+  gen = gen_avx512f_vpermi2varv16sf3;
+  maskmode = V16SImode;
+  break;
+case V2DImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv2di3;
+  break;
+case V4DImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4di3;
+  break;
 case V8DImode:
-  emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
-force_reg (V8DImode, mask), op1));
-  r

Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-09 Thread Jakub Jelinek
On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
>return true;
>  }
>  
> +/* AVX512F does support 64-byte integer vector operations,
> +   thus the longest vector we are faced with is V64QImode.  */
> +#define MAX_VECT_LEN 64
> +
> +struct expand_vec_perm_d
> +{
> +  rtx target, op0, op1;
> +  unsigned char perm[MAX_VECT_LEN];
> +  enum machine_mode vmode;
> +  unsigned char nelt;
> +  bool one_operand_p;
> +  bool testing_p;
> +};
> +
>  static bool
> -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct 
> expand_vec_perm_d *d)

Too long line, please wrap it.

>  {
> -  enum machine_mode mode = GET_MODE (op0);
> +  enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> +
>switch (mode)
>  {
> +case V8HImode:
> +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> + return false;
> +  break;
> +case V16HImode:
> +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> + return false;
> +case V32HImode:
> +  if (!TARGET_AVX512BW)
> + return false;
> +  break;
> +case V4SImode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V8SImode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V16SImode:
> +  if (!TARGET_AVX512F)
> + return false;
> +  break;
> +case V4SFmode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V8SFmode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V16SFmode:
> +  if (!TARGET_AVX512F)
> + return false;
> +  break;
> +case V2DImode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V4DImode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V8DImode:
> +  if (!TARGET_AVX512F)
> + return false;
> +  break;
> +case V2DFmode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V4DFmode:
> +  if (!TARGET_AVX512VL)
> + return false;
> +  break;
> +case V8DFmode:
> +  if (!TARGET_AVX512F)
> + return false;
> +  break;
> +default:
> +  return false;
> +}
> +
> +  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const 
> expander,
> + so args are either in d, or in op0, op1 etc.  */
> +  if (d)
> +{
> +  rtx vec[64];
> +  target = d->target;
> +  op0 = d->op0;
> +  op1 = d->op1;
> +  for (int i = 0; i < d->nelt; ++i)
> + vec[i] = GEN_INT (d->perm[i]);
> +  mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec));

Shouldn't the mask use integral vector mode rather than floating?

My strong preference would be:
  enum machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx);
right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
line and then inside of the first switch just do:
...
case V16SImode:
  if (!TARGET_AVX512F)
return false;
  gen = gen_avx512f_vpermi2varv16si3;
  break;
case V4SFmode:
  if (!TARGET_AVX512VL)
return false;
  gen = gen_avx512vl_vpermi2varv4sf3;
  maskmode = V4SImode;
  break;
...
etc., then in the mask = line use:
mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
and finally instead of the second switch do:
  emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
  return true;

Otherwise, the patch LGTM, but will leave the final approval to Uros.

Jakub


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-09 Thread Ilya Tocar
Hi,

I think this patch should be split in 2 parts:
V64QI related and non-V64QI related.
This part contains non-V64QI related changes.
Also I've noticed, that not all patterns using VI1_AVX2,
actually have AVX512 versions, so fixed bogus patterns.

On 06 Oct 16:10, Jakub Jelinek wrote:
> On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote:
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx 
> > op0, rtx mask, rtx op1)
> >enum machine_mode mode = GET_MODE (op0);
> >switch (mode)
> >  {
> > +  /* There is no byte version of vpermi2.  So we use vpermi2w.  */
> > +case V64QImode:
...
> 
> I believe this case doesn't belong to this function, other than this
> case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
> so it should always do that, and there should be a separate function
> that expands the worst case of V64QImode full 2 operand permutation.
> See my previous mail, IMHO it is doable with 5 instructions rather than 7.
> And IMHO we should have a separate function which emits that, supposedly
> one for the constant permutations, one for the variable case (perhaps
> then your 7 insn sequence is best?).
This will be done in following patch.
> 
> Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
> supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
> right now plus D, either D would be NULL (then it would behave as now), or
> SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
> I.e. the function would start with a switch that would just contain the
> if (...)
> return false; 
> hunks plus break; for the success case, then code to generate CONST_VECTOR
> if sel is NULL_RTX from d, and finally another switch with just the emit
> cases.
Done.

> 
> > +case V8HImode:
> > +  if (!TARGET_AVX512VL)
> > +   return false;
> > +  emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
> > +  force_reg (V8HImode, mask), 
> > op1));
> > +  return true;
> > +case V16HImode:
> > +  if (!TARGET_AVX512VL)
> > +   return false;
> > +  emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
> > +force_reg (V16HImode, mask), op1));
> > +  return true;
> 
> Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
> I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
> think neither of these depends on the other one in GCC.  That's unlike insns
> where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
> AVX512VL depends on AVX512F.
>
Good catch!

> > @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
> >  
> >if (d->one_operand_p)
> >  return false;
> > -  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> > +  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
> > +  GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
> 
> Formatting, && belongs on the second line.
> 
Fixed.

> > +;
> > +  else if (TARGET_AVX512VL)
> 
> I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here.
> AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
> and the == 32 and == 16 cases are handled because AVX512VL implies
> TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
> 
As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and
works only on 32/16-byte mode this case is redundant, so I've removed
it.

> > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d 
> > *d)
> >   return false;
> > }
> > }
> > +  else if (GET_MODE_SIZE (d->vmode) == 64)
> > +   {
> > + if (!TARGET_AVX512BW)
> > +   return false;
> > + if (vmode == V64QImode)
> > +   {
> > + for (i = 0; i < nelt; ++i)
> > +   if ((d->perm[i] ^ i) & (nelt / 4))
> > + return false;
> 
> Missing comment, I'd duplicate the
>   /* vpshufb only works intra lanes, it is not
>  possible to shuffle bytes in between the lanes.  */
> comment there.
> 
Done.

> > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> >   rtx (*gen) (rtx, rtx) = NULL;
> >   switch (d->vmode)
> > {
> > +   case V64QImode:
> > + if (TARGET_AVX512VL)
> 
> VL?  Isn't that BW?
> 
> > +   gen = gen_avx512bw_vec_dupv64qi;
> > + break;
> > case V32QImode:
> >   gen = gen_avx2_pbroadcastv32qi_1;
> >   break;
> > +   case V32HImode:
> > + if (TARGET_AVX512VL)
> 
> Ditto.
>
Fixed.

> > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> >  mode = V8DImode;
> >else if (mode == V16SFmode)
> >  mode = V16SImode;
> > +  else if (mode == V4DFmode)
> > +mode = V4DImode;
> > +  else if (mode == V2DFmode)
> > +mode = V2DImode;
> > +  else if (mod