On Mon, May 9, 2022 at 4:19 PM Uros Bizjak <ubiz...@gmail.com> wrote: > > On Mon, May 9, 2022 at 7:24 AM Hongtao Liu <crazy...@gmail.com> wrote: > > > > On Mon, May 9, 2022 at 1:22 PM liuhongt via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > pand/pandn may be used to clear upper/lower bits of the operands, in > > > that case there will be 4-5 instructions for permutation, and it's > > > still better than scalar codes. > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > Ok for trunk? > > > > > > > > > gcc/ChangeLog: > > > > > > PR target/105354 > > > * config/i386/i386-expand.cc > > > (expand_vec_perm_pslldq_psrldq_por): New function. > > > (ix86_expand_vec_perm_const_1): Try > > > expand_vec_perm_pslldq_psrldq_por for both 3-instruction and > > > 4/5-instruction sequence. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/pr105354-1.c: New test. > > > * gcc.target/i386/pr105354-2.c: New test. > > OK, with a slight adjustment below. > > Thanks, > Uros. > > > > --- > > > gcc/config/i386/i386-expand.cc | 109 +++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++ > > > 3 files changed, 349 insertions(+) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c > > > > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc > > > index bc806ffa283..49231e964ba 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd > > > (struct expand_vec_perm_d *d) > > > return true; > > > } > > > > > > +/* Implement permutation with pslldq + psrldq + por when pshufb is not > > > + available. 
*/ > > > +static bool > > > +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool > > > pandn) > > > +{ > > > + unsigned i, nelt = d->nelt; > > > + unsigned start1, end1 = -1; > > > + machine_mode vmode = d->vmode, imode; > > > + int start2 = -1; > > > + bool clear_op0, clear_op1; > > > + unsigned inner_size; > > > + rtx op0, op1, dop1; > > > + rtx (*gen_vec_shr) (rtx, rtx, rtx); > > > + rtx (*gen_vec_shl) (rtx, rtx, rtx); > > > + > > > + /* pshufb is available under TARGET_SSSE3. */ > > > + if (TARGET_SSSE3 || !TARGET_SSE2 > > You don't have to check for TARGET_SSSE3 here. The > expand_vec_perm_pslldq_psrldq_por should be positioned in > ix86_expand_vec_perm_const_1 in a place where a more optimal insn > sequence is already generated when TARGET_SSSE3 is available. Changed and committed, thanks for the review. > > > > + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */ > > > + || (vmode != E_V16QImode && vmode != E_V8HImode)) > > > + return false; > > > + > > > + start1 = d->perm[0]; > > > + for (i = 1; i < nelt; i++) > > > + { > > > + if (d->perm[i] != d->perm[i-1] + 1) > > > + { > > > + if (start2 == -1) > > > + { > > > + start2 = d->perm[i]; > > > + end1 = d->perm[i-1]; > > > + } > > > + else > > > + return false; > > > + } > > > + else if (d->perm[i] >= nelt > > > + && start2 == -1) > > > + { > > > + start2 = d->perm[i]; > > > + end1 = d->perm[i-1]; > > > + } > > > + } > > > + > > > + clear_op0 = end1 != nelt - 1; > > > + clear_op1 = start2 % nelt != 0; > > > + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */ > > > + if (!pandn && (clear_op0 || clear_op1)) > > > + return false; > > > + > > > + if (d->testing_p) > > > + return true; > > > + > > > + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : > > > gen_vec_shr_v8hi; > > > + gen_vec_shl = vmode == E_V16QImode ? 
gen_vec_shl_v16qi : > > > gen_vec_shl_v8hi; > > > + imode = GET_MODE_INNER (vmode); > > > + inner_size = GET_MODE_BITSIZE (imode); > > > + op0 = gen_reg_rtx (vmode); > > > + op1 = gen_reg_rtx (vmode); > > > + > > > + if (start1) > > > + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size))); > > > + else > > > + emit_move_insn (op0, d->op0); > > > + > > > + dop1 = d->op1; > > > + if (d->one_operand_p) > > > + dop1 = d->op0; > > > + > > > + int shl_offset = end1 - start1 + 1 - start2 % nelt; > > > + if (shl_offset) > > > + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * > > > inner_size))); > > > + else > > > + emit_move_insn (op1, dop1); > > > + > > > + /* Clear lower/upper bits for op0/op1. */ > > > + if (clear_op0 || clear_op1) > > > + { > > > + rtx vec[16]; > > > + rtx const_vec; > > > + rtx clear; > > > + for (i = 0; i != nelt; i++) > > > + { > > > + if (i < (end1 - start1 + 1)) > > > + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, > > > imode); > > > + else > > > + vec[i] = CONST0_RTX (imode); > > > + } > > > + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec)); > > > + const_vec = validize_mem (force_const_mem (vmode, const_vec)); > > > + clear = force_reg (vmode, const_vec); > > > + > > > + if (clear_op0) > > > + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear)); > > > + if (clear_op1) > > > + emit_move_insn (op1, gen_rtx_AND (vmode, > > > + gen_rtx_NOT (vmode, clear), > > > + op1)); > > > + } > > > + > > > + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1)); > > > + return true; > > > +} > > > + > > > /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even > > > and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI > > > operands with two "and" and "pack" or two "shift" and "pack" insns. 
> > > @@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct > > > expand_vec_perm_d *d) > > > if (expand_vec_perm_pshufb2 (d)) > > > return true; > > > > > > + if (expand_vec_perm_pslldq_psrldq_por (d, false)) > > > + return true; > > > + > > > if (expand_vec_perm_interleave3 (d)) > > > return true; > > > > > > @@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct > > > expand_vec_perm_d *d) > > > if (expand_vec_perm_even_odd (d)) > > > return true; > > > > > > + /* Generate four or five instructions. */ > > > + if (expand_vec_perm_pslldq_psrldq_por (d, true)) > > > + return true; > > > + > > > /* Even longer sequences. */ > > > if (expand_vec_perm_vpshufb4_vpermq2 (d)) > > > return true; > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c > > > b/gcc/testsuite/gcc.target/i386/pr105354-1.c > > > new file mode 100644 > > > index 00000000000..8d91ded7420 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c > > > @@ -0,0 +1,130 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ > > > +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */ > > > +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */ > > > +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */ > > > +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */ > > > +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */ > > > + > > > +typedef short v8hi __attribute__((vector_size (16))); > > > +typedef char v16qi __attribute__((vector_size (16))); > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo (v16qi a, v16qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 15, 16, 17, 18, 19, 20); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo1 (v16qi a, v16qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 15, 18, 19, 20, 21, 22); > > > +} > > > + > > > 
+v16qi > > > +__attribute__((noipa)) > > > +foo2 (v16qi a, v16qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 16, 17, 18, 19, 20, 21); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo3 (v16qi a, v16qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 17, 18, 19, 20, 21, 22); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo4 (v8hi a, v8hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo5 (v8hi a, v8hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo6 (v8hi a, v8hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo7 (v8hi a, v8hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo8 (v16qi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 15, 16, 17, 18, 19, 20); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo9 (v16qi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 15, 18, 19, 20, 21, 22); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo10 (v16qi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 16, 17, 18, 19, 20, 21); > > > +} > > > + > > > +v16qi > > > +__attribute__((noipa)) > > > +foo11 (v16qi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, > > > + 13, 14, 17, 18, 19, 20, 21, 22); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo12 (v8hi a) > > 
> +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo13 (v8hi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo14 (v8hi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13); > > > +} > > > + > > > +v8hi > > > +__attribute__((noipa)) > > > +foo15 (v8hi a) > > > +{ > > > + return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14); > > > +} > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c > > > b/gcc/testsuite/gcc.target/i386/pr105354-2.c > > > new file mode 100644 > > > index 00000000000..b78b62e1e7e > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c > > > @@ -0,0 +1,110 @@ > > > +/* { dg-do run } */ > > > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ > > > +/* { dg-require-effective-target sse2 } */ > > > + > > > +#include "sse2-check.h" > > > + > > > +#include "pr105354-1.c" > > > +void > > > +sse2_test (void) > > > +{ > > > + union128i_b a, b, res_ab, exp_ab; > > > + union128i_w c, d, res_cd, exp_cd; > > > + > > > + for (int i = 0; i != 16;i++) > > > + { > > > + a.a[i] = i; > > > + b.a[i] = i + 16; > > > + res_ab.a[i] = 0; > > > + exp_ab.a[i] = -1; > > > + if (i <= 8) > > > + { > > > + c.a[i] = i; > > > + d.a[i] = i + 8; > > > + res_cd.a[i] = 0; > > > + exp_cd.a[i] = -1; > > > + } > > > + } > > > + > > > + res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x); > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 15, 16, 17, 18, 19, 20 }; > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort (); > > > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 15, 18, 19, 20, 21, 22 }; > > > + res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > 
> > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 16, 17, 18, 19, 20, 21 }; > > > + res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > > > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 17, 18, 19, 20, 21, 22 }; > > > + res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > > > + > > > + res_ab.x = (__m128i)foo8 ((v16qi)a.x); > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 15, 0, 1, 2, 3, 4 }; > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort (); > > > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 15, 2, 3, 4, 5, 6 }; > > > + res_ab.x = (__m128i)foo9 ((v16qi)a.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > > > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 0, 1, 2, 3, 4, 5 }; > > > + res_ab.x = (__m128i)foo10 ((v16qi)a.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > > > + > > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, > > > 13, 14, 1, 2, 3, 4, 5, 6 }; > > > + res_ab.x = (__m128i)foo11 ((v16qi)a.x); > > > + if (check_union128i_b (exp_ab, res_ab.a)) > > > + abort(); > > > + > > > + res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x); > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 }; > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort (); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 > > > }; > > > + res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 > > > }; > > > + res_cd.x = 
(__m128i)foo6 ((v8hi)c.x, (v8hi)d.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > + res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x); > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 > > > }; > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort (); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 }; > > > + res_cd.x = (__m128i)foo12 ((v8hi)c.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 }; > > > + res_cd.x = (__m128i)foo13 ((v8hi)c.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 }; > > > + res_cd.x = (__m128i)foo14 ((v8hi)c.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 }; > > > + res_cd.x = (__m128i)foo15 ((v8hi)c.x); > > > + if (check_union128i_w (exp_cd, res_cd.a)) > > > + abort(); > > > + > > > +} > > > + > > > -- > > > 2.18.1 > > > > > > > > > -- > > BR, > > Hongtao
-- BR, Hongtao