On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao....@intel.com> wrote:
>
> Optimize
>
>   _1 = *srcp_3(D);
>   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>   _5 = BIT_FIELD_REF <_4, 128, 0>;
>
> to
>
>   _1 = *srcp_3(D);
>   _5 = BIT_FIELD_REF <_1, 128, 128>;
>
> The above will eventually be optimized to
>
>   _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
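For reference, C source along the following lines should produce the
GIMPLE above (a minimal sketch using GNU vector extensions; upper_half
is a made-up name, the patch's own testcase is further down):

  typedef int v8si __attribute__((vector_size (32)));
  typedef int v4si __attribute__((vector_size (16)));

  /* Take the upper 128 bits of a 256-bit vector load; the duplicated
     shuffle indices and the four-element initializer are what become
     the VEC_PERM_EXPR and the BIT_FIELD_REF shown above.  */
  v4si
  upper_half (v8si *srcp)
  {
    v8si perm = __builtin_shuffle (*srcp, (v8si) { 4, 5, 6, 7, 4, 5, 6, 7 });
    return (v4si) { perm[0], perm[1], perm[2], perm[3] };
  }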
Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already handle
this in the

      if (code == VEC_PERM_EXPR
          && constant_multiple_p (bit_field_offset (op), size, &idx))
        {

part of the code - maybe that needs to be enhanced to cover
a contiguous stride in the VEC_PERM_EXPR.  I see we have

      size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
      if (maybe_ne (bit_field_size (op), size))
        return false;

where it will currently bail, so adjust that to check for a
constant multiple.  I also think we should only handle the case
where the new bit_field_offset alignment is not worse than the
original one.

That said, I'd prefer it if you integrated this transform into
simplify_bitfield_ref; a standalone sketch of the two checks follows
the quoted patch at the end of this mail.

Richard.

> gcc/ChangeLog:
>
>         PR tree-optimization/102583
>         * gimple.h (gate_optimize_vector_load): Declare.
>         * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
>         7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
>         * tree-ssa-forwprop.cc (gate_optimize_vector_load): New
>         function.
>         (pass_forwprop::execute): Move the load conditions into the
>         new function.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102583.c: New test.
> ---
>  gcc/gimple.h                             |  1 +
>  gcc/match.pd                             | 56 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
>  gcc/tree-ssa-forwprop.cc                 | 32 +++++++++----
>  4 files changed, 109 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
>
> diff --git a/gcc/gimple.h b/gcc/gimple.h
> index 6b1e89ad74e..1747dae1193 100644
> --- a/gcc/gimple.h
> +++ b/gcc/gimple.h
> @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *);
>  extern bool gimple_inexpensive_call_p (gcall *);
>  extern bool stmt_can_terminate_bb_p (gimple *);
>  extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
> +extern bool gate_optimize_vector_load (gimple *);
>
>  /* Return the disposition for a warning (or all warnings by default)
>     for a statement.  */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b3..ac214310251 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      }
>      (cmp @0 { res; })))))))))
>
> +#if GIMPLE
> +/* Simplify partial vector access, transform
> +
> +   V8SI A;
> +   V4SI B;
> +   A = *PA;
> +   B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
> +   C = BIT_FIELD_REF (B, 128, 0)
> +
> +   to
> +
> +   A = *PA;
> +   C = BIT_FIELD_REF (A, 128, 128);
> +
> +   optimize_vector_load will eventually optimize the latter to
> +
> +   C = BIT_FIELD_REF (*PA, 128, 128);  */
> +
> +(simplify
> + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
> + (if (VECTOR_TYPE_P (type)
> +      && TYPE_MODE (type) != BLKmode
> +      && single_use (@2)
> +      && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
> +      && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
> +  (with
> +   {
> +     unsigned HOST_WIDE_INT nelts = -1;
> +     if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
> +       return NULL_TREE;
> +     tree inner_type = TREE_TYPE (type);
> +     unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
> +     unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
> +     unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
> +     unsigned HOST_WIDE_INT start
> +       = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
> +
> +     for (unsigned HOST_WIDE_INT i = pos / elt_w + 1; i != size / elt_w; i++)
> +       {
> +         /* Contiguous area.  */
> +         if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
> +             != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
> +           return NULL_TREE;
> +       }
> +
> +     /* The access must be aligned, or the target must support
> +        misaligned moves (movmisalign_optab).  */
> +     unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
> +     if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
> +          || start * elt_w % dest_align)
> +         && (optab_handler (movmisalign_optab, TYPE_MODE (type))
> +             == CODE_FOR_nothing))
> +       return NULL_TREE;
> +   }
> +   (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
> +#endif
> +
>  /* Canonicalizations of BIT_FIELD_REFs.  */
>
>  (simplify
> diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> new file mode 100644
> index 00000000000..ff2ffb5e671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> +
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef float v8sf __attribute__((vector_size(32)));
> +typedef float v4sf __attribute__((vector_size(16)));
> +typedef float v2sf __attribute__((vector_size(8)));
> +
> +v8sf part (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
> +                  (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
> +}
> +
> +v4sf part1 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
> +}
> +
> +v2sf part2 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v2sf) { (float)src[4], (float)src[5] };
> +}
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 484491fa1c5..2c8d8bc6dce 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    return true;
>  }
>
> +/* Gate for optimize_vector_load.  */
> +bool
> +gate_optimize_vector_load (gimple *stmt)
> +{
> +  if (!is_gimple_assign (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  return (cfun
> +          && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> +          && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> +              /* After vector lowering rewrite all loads, but
> +                 initially do not since this conflicts with
> +                 vector CONSTRUCTOR to shuffle optimization.  */
> +              || (cfun->curr_properties & PROP_gimple_lvec))
> +          && gimple_assign_load_p (stmt)
> +          && !gimple_has_volatile_ops (stmt)
> +          && !stmt_can_throw_internal (cfun, stmt)
> +          && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
> +}
>
>  /* Rewrite the vector load at *GSI to component-wise loads if the load
>     is only used in BIT_FIELD_REF extractions with eventual intermediate
> @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun)
>            else
>              gsi_next (&gsi);
>          }
> -      else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> -               && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> -                   /* After vector lowering rewrite all loads, but
> -                      initially do not since this conflicts with
> -                      vector CONSTRUCTOR to shuffle optimization.  */
> -                   || (fun->curr_properties & PROP_gimple_lvec))
> -               && gimple_assign_load_p (stmt)
> -               && !gimple_has_volatile_ops (stmt)
> -               && !stmt_can_throw_internal (cfun, stmt)
> -               && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
> +      else if (gate_optimize_vector_load (stmt))
>          optimize_vector_load (&gsi);
>
>        else if (code == COMPLEX_EXPR)
> --
> 2.18.1
>
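Here is the standalone sketch referenced above - not GCC internals,
just the logic of the two checks for simplify_bitfield_ref, with
made-up helper names:

  #include <stdbool.h>
  #include <stddef.h>

  /* Check that the permute selector SEL picks N consecutive source
     lanes starting at lane FIRST; on success return the first selected
     source lane in *SRC_LANE, i.e. the lane where the rewritten
     BIT_FIELD_REF would start.  */
  static bool
  contiguous_lanes_p (const unsigned *sel, size_t first, size_t n,
                      unsigned *src_lane)
  {
    for (size_t i = 1; i < n; i++)
      if (sel[first + i] != sel[first] + i)
        return false;
    *src_lane = sel[first];
    return true;
  }

  /* Alignment of a bit offset: the largest power of two dividing it,
     computed via the lowest set bit (offset 0 counts as fully
     aligned).  */
  static unsigned long
  bit_offset_alignment (unsigned long bit_offset)
  {
    return bit_offset ? (bit_offset & -bit_offset) : ~0UL;
  }

  /* The transform should only fire when the new BIT_FIELD_REF offset
     is at least as aligned as the original one.  */
  static bool
  new_alignment_ok_p (unsigned long old_bit_offset,
                      unsigned long new_bit_offset)
  {
    return bit_offset_alignment (new_bit_offset)
           >= bit_offset_alignment (old_bit_offset);
  }

For the motivating example, sel = { 4, 5, 6, 7, 4, 5, 6, 7 } with
first = 0 and n = 4 passes the contiguity check with src_lane = 4, and
the new offset 128 is no less aligned than the original 0.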