Hi, the patch adds use of the palignr instruction when we have a one-operand permutation like {5 6 7 0 1 2 3 4}:
Treating this as {5 6 7 8 9 a b c} on 2 operands, and therefore palignr on 5. Bootstrap and make check passed. Is it ok? Evgeny 2014-04-29 Evgeny Stupachenko <evstu...@gmail.com> * config/i386/i386.c (expand_vec_perm_palignr_one_operand): New. Enables PALIGNR on one operand permutation. * config/i386/i386.c (expand_vec_perm_1): Try PALIGNR on one operand. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 002d295..8950cf7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -42807,6 +42807,97 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return true; } +/* A subroutine of ix86_expand_vec_perm_1. Try to use just palignr + instruction for one operand permutation. This is better than pshufb + as does not require to pass big constant and faster on some x86 + architectures. */ + +static bool +expand_vec_perm_palignr_one_operand (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt; + unsigned min; + unsigned in_order_length, in_order_length_max; + rtx shift, shift1, target, tmp; + + /* PALIGNR of 2 128-bits registers takes only 1 instrucion. + Requires SSSE3. */ + if (GET_MODE_SIZE (d->vmode) == 16) + { + if(!TARGET_SSSE3) + return false; + } + /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions: + PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM. */ + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if(!TARGET_AVX2) + return false; + } + else + return false; + + if (d->one_operand_p != true) + return false; + + /* For an in order permutation with one operand like: {5 6 7 0 1 2 3 4} + PALIGNR is better than PSHUFB. Check for an order in permutation. 
*/ + in_order_length = 0; + in_order_length_max = 0; + if (d->one_operand_p == true) + for (i = 0; i < 2 * nelt; ++i) + { + if ((d->perm[(i + 1) & (nelt - 1)] - + d->perm[i & (nelt - 1)]) != 1) + { + if (in_order_length > in_order_length_max) + in_order_length_max = in_order_length; + in_order_length = 0; + } + else + in_order_length++; + } + + /* If not an ordered permutation then try something else. */ + if (in_order_length_max != nelt - 1) + return false; + + min = d->perm[0]; + + shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); + shift1 = GEN_INT ((min - nelt / 2) * + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); + + if (GET_MODE_SIZE (d->vmode) != 32) + { + target = gen_reg_rtx (TImode); + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), + gen_lowpart (TImode, d->op0), shift)); + } + else + { + target = gen_reg_rtx (V2TImode); + tmp = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv2ti (tmp, + gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (33))); + if (min < nelt / 2) + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, tmp), + gen_lowpart (V2TImode, d->op0), + shift)); + else + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, d->op1), + gen_lowpart (V2TImode, tmp), + shift1)); + } + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + + return true; +} + static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d); /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D @@ -42943,6 +43034,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vpermil (d)) return true; + /* Try palignr on one operand. */ + if (expand_vec_perm_palignr_one_operand (d)) + return true; + /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, vpshufb, vpermd, vpermps or vpermq variable permutation. */ if (expand_vec_perm_pshufb (d))