Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On Mon, Oct 20, 2014 at 5:19 PM, Ilya Tocar wrote: >> > >> > The patch is OK with the above improvement. >> > >> > >> >> Will commit version below, if no objections in 24 hours. >> >> > Sorry, > I've missed palignr, which should also have v64qi version, > and lost return in expand_vec_perm_palignr case > (this caused avx512f-vec-unpack test failures). > Patch below fixes it. Ok for trunk? > > 2014-10-20 Ilya Tocar > > * config/i386/i386.c (expand_vec_perm_1): Fix > expand_vec_perm_palignr case. > * config/i386/sse.md (_palignr_mask): Use > VI1_AVX512. OK. Thanks, Uros.
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
> > > > The patch is OK with the above improvement. > > > > > > Will commit version below, if no objections in 24 hours. > > Sorry, I've missed palignr, which should also have v64qi version, and lost return in expand_vec_perm_palignr case (this caused avx512f-vec-unpack test failures). Patch below fixes it. Ok for trunk? 2014-10-20 Ilya Tocar * config/i386/i386.c (expand_vec_perm_1): Fix expand_vec_perm_palignr case. * config/i386/sse.md (_palignr_mask): Use VI1_AVX512. --- gcc/config/i386/i386.c | 1 + gcc/config/i386/sse.md | 12 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 33b21f4..34273ca 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -43552,6 +43552,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) /* Try the AVX2 vpalignr instruction. */ if (expand_vec_perm_palignr (d, true)) +return true; /* Try the AVX512F vpermi2 instructions. */ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8157045..a3f336f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13716,14 +13716,14 @@ (set_attr "mode" "DI")]) (define_insn "_palignr_mask" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=v") -(vec_merge:VI1_AVX2 - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "v") -(match_operand:VI1_AVX2 2 "nonimmediate_operand" "vm") + [(set (match_operand:VI1_AVX512 0 "register_operand" "=v") +(vec_merge:VI1_AVX512 + (unspec:VI1_AVX512 + [(match_operand:VI1_AVX512 1 "register_operand" "v") +(match_operand:VI1_AVX512 2 "nonimmediate_operand" "vm") (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] UNSPEC_PALIGNR) - (match_operand:VI1_AVX2 4 "vector_move_operand" "0C") + (match_operand:VI1_AVX512 4 "vector_move_operand" "0C") (match_operand: 5 "register_operand" "Yk")))] "TARGET_AVX512BW && ( == 64 || TARGET_AVX512VL)" { -- 1.8.3.1
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On Thu, Oct 16, 2014 at 02:23:16PM +0400, Ilya Tocar wrote: > On 10 Oct 18:37, Uros Bizjak wrote: > > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar wrote: > > > > > > Please recode that horrible first switch statement to: > > > > --cut here-- > > rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; > > > > switch (mode) > > { > > case V8HImode: > > if (TARGET_AVX512VL && TARGET_AVX152BW) > > gen = gen_avx512vl_vpermi2varv8hi3; > > break; > > > > ... > > > > case V2DFmode: > > if (TARGET_AVX512VL) > > { > > gen = gen_avx512vl_vpermi2varv2df3; > > maskmode = V2DImode; > > > > The patch is OK with the above improvement. > > > > Thanks, > > Uros. > > > > Will commit version below, if no objections in 24 hours. No need to wait, it is ok now (with proper ChangeLog of course). Jakub
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On 10 Oct 18:37, Uros Bizjak wrote: > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar wrote: > > > Please recode that horrible first switch statement to: > > --cut here-- > rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; > > switch (mode) > { > case V8HImode: > if (TARGET_AVX512VL && TARGET_AVX152BW) > gen = gen_avx512vl_vpermi2varv8hi3; > break; > > ... > > case V2DFmode: > if (TARGET_AVX512VL) > { > gen = gen_avx512vl_vpermi2varv2df3; > maskmode = V2DImode; > > The patch is OK with the above improvement. > > Thanks, > Uros. > Will commit version below, if no objections in 24 hours. --- gcc/config/i386/i386.c | 292 ++--- gcc/config/i386/sse.md | 45 2 files changed, 255 insertions(+), 82 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index aedac19..e1228e3 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21411,35 +21411,132 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. */ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, + struct expand_vec_perm_d *d) { - enum machine_mode mode = GET_MODE (op0); + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); + enum machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + switch (mode) { +case V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermi2varv8hi3; + break; +case V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermi2varv16hi3; + break; +case V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vpermi2varv32hi3; + break; +case V4SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv4si3; + break; +case V8SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv8si3; + break; case V16SImode: - emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + gen = gen_avx512f_vpermi2varv16si3; + break; +case V4SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv4sf3; + maskmode = V4SImode; + } + break; +case V8SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv8sf3; + maskmode = V8SImode; + } + break; case V16SFmode: - emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermi2varv16sf3; + maskmode = V16SImode; + } + break; +case V2DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv2di3; + break; +case V4DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv4di3; + break; case V8DImode: - emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, - force_reg (V8DImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + gen = gen_avx512f_vpermi2varv8di3; + break; +case V2DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv2df3; + maskmode = V2DImode; + } + break; +case V4DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv4df3; + maskmode = V4DImode; + } + break; case V8DFmode: - emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, - force_reg (V8DImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermi2varv8df3; + maskmode = V8DImode; + } + break; default: - return false; + break; } + + if (gen == NULL) +return false; + + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + if (d
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar wrote: >> My strong preference would be: >> enum machine_mode maskmode = mode; >> rtx (*gen) (rtx, rtx, rtx, rtx); >> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); >> line and then inside of the first switch just do: >> ... >> case V16SImode: >> if (!TARGET_AVX512F) >> return false; >> gen = gen_avx512f_vpermi2varv16si3; >> break; >> case V4SFmode: >> if (!TARGET_AVX512VL) >> return false; >> gen = gen_avx512vl_vpermi2varv4sf3; >> maskmode = V4SImode; >> break; >> ... >> etc., then in the mask = line use: >> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); >> and finally instead of the second switch do: >> emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); >> return true; >> > Updated patch below. Please recode that horrible first switch statement to: --cut here-- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; switch (mode) { case V8HImode: if (TARGET_AVX512VL && TARGET_AVX152BW) gen = gen_avx512vl_vpermi2varv8hi3; break; ... case V2DFmode: if (TARGET_AVX512VL) { gen = gen_avx512vl_vpermi2varv2df3; maskmode = V2DImode; } break; default: break; } if (gen == NULL) return false; --cut here-- The patch is OK with the above improvement. (Please also note that the patch has a bunch of i386.md changes that will clash with followup patch series). Thanks, Uros.
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On Fri, Oct 10, 2014 at 07:47:19PM +0400, Ilya Tocar wrote: > Updated patch below. You haven't posted ChangeLog entry this time, so using the last one: * config/i386/i386.c (MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2. ... * config/i386/sse.md (define_mode_iterator VI1_AVX512): New. I'd think you should avoid the line break after filename in these cases, so * config/i386/i386.c (MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2. ... * config/i386/sse.md (define_mode_iterator VI1_AVX512): New. Other than that nit it looks good to me. Jakub
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On 09 Oct 20:51, Jakub Jelinek wrote: > On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote: > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[]) > >return true; > > } > > > > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, > > struct expand_vec_perm_d *d) > > Too long line, please wrap it. > Fixed. > > { > > - enum machine_mode mode = GET_MODE (op0); > > + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); > > + > >switch (mode) > > { > > +case V8HImode: > > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > > + return false; > > My strong preference would be: > enum machine_mode maskmode = mode; > rtx (*gen) (rtx, rtx, rtx, rtx); > right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); > line and then inside of the first switch just do: > ... > case V16SImode: > if (!TARGET_AVX512F) > return false; > gen = gen_avx512f_vpermi2varv16si3; > break; > case V4SFmode: > if (!TARGET_AVX512VL) > return false; > gen = gen_avx512vl_vpermi2varv4sf3; > maskmode = V4SImode; > break; > ... > etc., then in the mask = line use: > mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); > and finally instead of the second switch do: > emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); > return true; > Updated patch below. --- gcc/config/i386/i386.c | 281 +++-- gcc/config/i386/sse.md | 45 2 files changed, 253 insertions(+), 73 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 352ab81..2247da8 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21358,33 +21358,132 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. */ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, + struct expand_vec_perm_d *d) { - enum machine_mode mode = GET_MODE (op0); + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); + enum machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx); + switch (mode) { +case V8HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + gen = gen_avx512vl_vpermi2varv8hi3; + break; +case V16HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + gen = gen_avx512vl_vpermi2varv16hi3; + break; +case V32HImode: + if (!TARGET_AVX512BW) + return false; + gen = gen_avx512bw_vpermi2varv32hi3; + break; +case V4SImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4si3; + break; +case V8SImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv8si3; + break; case V16SImode: - emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv16si3; + break; +case V4SFmode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4sf3; + maskmode = V4SImode; + break; +case V8SFmode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv8sf3; + maskmode = V8SImode; + break; case V16SFmode: - emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv16sf3; + maskmode = V16SImode; + break; +case V2DImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv2di3; + break; +case V4DImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4di3; + break; case V8DImode: - emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, -force_reg (V8DImode, mask), op1)); - r
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[]) >return true; > } > > +/* AVX512F does support 64-byte integer vector operations, > + thus the longest vector we are faced with is V64QImode. */ > +#define MAX_VECT_LEN 64 > + > +struct expand_vec_perm_d > +{ > + rtx target, op0, op1; > + unsigned char perm[MAX_VECT_LEN]; > + enum machine_mode vmode; > + unsigned char nelt; > + bool one_operand_p; > + bool testing_p; > +}; > + > static bool > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct > expand_vec_perm_d *d) Too long line, please wrap it. > { > - enum machine_mode mode = GET_MODE (op0); > + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); > + >switch (mode) > { > +case V8HImode: > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > + return false; > + break; > +case V16HImode: > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > + return false; > +case V32HImode: > + if (!TARGET_AVX512BW) > + return false; > + break; > +case V4SImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V8SImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V16SImode: > + if (!TARGET_AVX512F) > + return false; > + break; > +case V4SFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V8SFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V16SFmode: > + if (!TARGET_AVX512F) > + return false; > + break; > +case V2DImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V4DImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V8DImode: > + if (!TARGET_AVX512F) > + return false; > + break; > +case V2DFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V4DFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > +case V8DFmode: > + if (!TARGET_AVX512F) > + return false; > + break; > +default: > + return false; > +} > + > + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const > expander, > + so args are either in d, or in op0, op1 etc. */ > + if (d) > +{ > + rtx vec[64]; > + target = d->target; > + op0 = d->op0; > + op1 = d->op1; > + for (int i = 0; i < d->nelt; ++i) > + vec[i] = GEN_INT (d->perm[i]); > + mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec)); Shouldn't the mask use integral vector mode rather than floating? My strong preference would be: enum machine_mode maskmode = mode; rtx (*gen) (rtx, rtx, rtx, rtx); right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); line and then inside of the first switch just do: ... case V16SImode: if (!TARGET_AVX512F) return false; gen = gen_avx512f_vpermi2varv16si3; break; case V4SFmode: if (!TARGET_AVX512VL) return false; gen = gen_avx512vl_vpermi2varv4sf3; maskmode = V4SImode; break; ... etc., then in the mask = line use: mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); and finally instead of the second switch do: emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); return true; Otherwise, the patch LGTM, but will leave the final approval to Uros. Jakub
Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
Hi, I think this patch should be split in 2 parts: V64QI related and non-V64QI related. This part contains non-V64QI related changes. Also I've noticed, that not all patterns using VI1_AVX2, actually have AVX512 versions, so fixed bogus patterns. On 06 Oct 16:10, Jakub Jelinek wrote: > On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote: > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx > > op0, rtx mask, rtx op1) > >enum machine_mode mode = GET_MODE (op0); > >switch (mode) > > { > > + /* There is no byte version of vpermi2. So we use vpermi2w. */ > > +case V64QImode: ... > > I believe this case doesn't belong to this function, other than this > case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and > so it should always do that, and there should be a separate function > that expands the worst case of V64QImode full 2 operand permutation. > See my previous mail, IMHO it is doable with 5 instructions rather than 7. > And IMHO we should have a separate function which emits that, supposedly > one for the constant permutations, one for the variable case (perhaps > then your 7 insn sequence is best?). This will be done in following patch. > > Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers, > supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes > right now plus D, either D would be NULL (then it would behave as now), or > SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed. > I.e. the function would start with a switch that would just contain the > if (...) > return false; > hunks plus break; for the success case, then code to generate CONST_VECTOR > if sel is NULL_RTX from d, and finally another switch with just the emit > cases. Done. > > > +case V8HImode: > > + if (!TARGET_AVX512VL) > > + return false; > > + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, > > + force_reg (V8HImode, mask), > > op1)); > > + return true; > > +case V16HImode: > > + if (!TARGET_AVX512VL) > > + return false; > > + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, > > +force_reg (V16HImode, mask), op1)); > > + return true; > > Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW? > I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I > think neither of these depends on the other one in GCC. That's unlike insns > where CPUID AVX512VL and AVX512F are mentioned together, because in GCC > AVX512VL depends on AVX512F. > Good catch! > > @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) > > > >if (d->one_operand_p) > > return false; > > - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) > > + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 && > > + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) > > Formatting, && belongs on the second line. > Fixed. > > +; > > + else if (TARGET_AVX512VL) > > I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here. > AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones, > and the == 32 and == 16 cases are handled because AVX512VL implies > TARGET_AVX2 and TARGET_SSE4_1, doesn't it? > As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and works only on 32/16-byte mode this case is redundant, so I've removed it. > > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d > > *d) > > return false; > > } > > } > > + else if (GET_MODE_SIZE (d->vmode) == 64) > > + { > > + if (!TARGET_AVX512BW) > > + return false; > > + if (vmode == V64QImode) > > + { > > + for (i = 0; i < nelt; ++i) > > + if ((d->perm[i] ^ i) & (nelt / 4)) > > + return false; > > Missing comment, I'd duplicate the > /* vpshufb only works intra lanes, it is not > possible to shuffle bytes in between the lanes. */ > comment there. > Done. > > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > > rtx (*gen) (rtx, rtx) = NULL; > > switch (d->vmode) > > { > > + case V64QImode: > > + if (TARGET_AVX512VL) > > VL? Isn't that BW? > > > + gen = gen_avx512bw_vec_dupv64qi; > > + break; > > case V32QImode: > > gen = gen_avx2_pbroadcastv32qi_1; > > break; > > + case V32HImode: > > + if (TARGET_AVX512VL) > > Ditto. > Fixed. > > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > > mode = V8DImode; > >else if (mode == V16SFmode) > > mode = V16SImode; > > + else if (mode == V4DFmode) > > +mode = V4DImode; > > + else if (mode == V2DFmode) > > +mode = V2DImode; > > + else if (mod