Is the following patch ok? It passes bootstrap and make check. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 88142a8..91f6f21 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return true; }
+static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d); + /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D in a single instruction. */ @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshufb (d)) return true; + /* Try the AVX2 vpshufb. */ + if (expand_vec_perm_vpshufb2_vpermq (d)) + return true; + /* Try the AVX512F vpermi2 instructions. */ rtx vec[64]; enum machine_mode mode = d->vmode; @@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) } /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - the permutation using the SSSE3 palignr instruction. This succeeds + the permutation using the SSSE3/AVX2 palignr instruction. This succeeds when all of the elements in PERM fit within one vector and we merely need to shift them down so that a single vector permutation has a chance to succeed. */ @@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) unsigned i, nelt = d->nelt; unsigned min, max; bool in_order, ok; - rtx shift, target; + rtx shift, shift1, target, tmp; struct expand_vec_perm_d dcopy; - /* Even with AVX, palignr only operates on 128-bit vectors. */ - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + /* SSSE3 is required to apply PALIGNR on 16 bytes operands. */ + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSSE3) + return false; + } + /* AVX2 is required to apply PALIGNR on 32 bytes operands. */ + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX2) + return false; + } + /* Other sizes are not supported. 
*/ + else return false; - min = nelt, max = 0; + min = 2 * nelt, max = 0; for (i = 0; i < nelt; ++i) { unsigned e = d->perm[i]; @@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) dcopy = *d; shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); - target = gen_reg_rtx (TImode); - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), - gen_lowpart (TImode, d->op0), shift)); + shift1 = GEN_INT ((min - nelt / 2) * + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); + + if (GET_MODE_SIZE (d->vmode) != 32) + { + target = gen_reg_rtx (TImode); + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), + gen_lowpart (TImode, d->op0), shift)); + } + else + { + target = gen_reg_rtx (V2TImode); + tmp = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv2ti (tmp, + gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (33))); + if (min < nelt / 2) + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, tmp), + gen_lowpart (V2TImode, d->op0), + shift)); + else + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, d->op1), + gen_lowpart (V2TImode, tmp), + shift1)); + } + dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); dcopy.one_operand_p = true; On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <r...@redhat.com> wrote: > On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote: >> Agree on checks: >> >> /* PALIGNR of 2 128-bit registers takes only 1 instruction. >> Requires SSSE3. */ >> if (GET_MODE_SIZE (d->vmode) == 16) >> { >> if(!TARGET_SSSE3) >> return false; >> } >> /* PALIGNR of 2 256-bit registers on AVX2 costs only 2 instructions: >> PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM. */ >> else if (GET_MODE_SIZE (d->vmode) == 32) >> { >> if(!TARGET_AVX2) >> return false; >> } >> else >> return false; > > Thanks, much better. > > > r~