Optimize _1 = *srcp_3(D); _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>; _5 = BIT_FIELD_REF <_4, 128, 0>;
to _1 = *srcp_3(D); _5 = BIT_FIELD_REF <_1, 128, 128>; the above will finally be optimized to _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>; Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR tree-optimization/102583 * gimple.h (gate_optimize_vector_load): Declare. * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6, 7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128). * tree-ssa-forwprop.cc (gate_optimize_vector_load): New function. (pass_forwprop::execute): Move the condition checks into the new function. gcc/testsuite/ChangeLog: * gcc.target/i386/pr102583.c: New test. --- gcc/gimple.h | 1 + gcc/match.pd | 56 ++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++ gcc/tree-ssa-forwprop.cc | 32 +++++++++----- 4 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c diff --git a/gcc/gimple.h b/gcc/gimple.h index 6b1e89ad74e..1747dae1193 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *); extern bool gimple_inexpensive_call_p (gcall *); extern bool stmt_can_terminate_bb_p (gimple *); extern location_t gimple_or_expr_nonartificial_location (gimple *, tree); +extern bool gate_optimize_vector_load (gimple *); /* Return the disposition for a warning (or all warnings by default) for a statement. 
*/ diff --git a/gcc/match.pd index 6d691d302b3..ac214310251 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) } (cmp @0 { res; }))))))))) +#if GIMPLE +/* Simplify partial vector access, transform + + V8SI A; + V4SI B; + A = *PA; + B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 }); + C = BIT_FIELD_REF (B, 128, 0) + +to + + A = *PA; + C = BIT_FIELD_REF (A, 128, 128); + +optimize_vector_load will eventually optimize the above to + + C = BIT_FIELD_REF (*PA, 128, 128); */ + +(simplify + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos) + (if (VECTOR_TYPE_P (type) + && TYPE_MODE (type) != BLKmode + && single_use (@2) + && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0)) + && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0)))) + (with + { + unsigned HOST_WIDE_INT nelts = -1; + if (!VECTOR_CST_NELTS (@1).is_constant (&nelts)) + return NULL_TREE; + tree inner_type = TREE_TYPE (type); + unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type)); + unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos); + unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize); + unsigned HOST_WIDE_INT start + = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w)); + + for (unsigned HOST_WIDE_INT i = pos / elt_w + 1; i != size / elt_w; i++) + { + /* Continuous area. */ + if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1 + != tree_to_uhwi (vector_cst_elt (@1, i - 1))) + return NULL_TREE; + } + + /* Aligned or support movmisalign_optab. */ + unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type)); + if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align + || start * elt_w % dest_align) + && (optab_handler (movmisalign_optab, TYPE_MODE (type)) + == CODE_FOR_nothing)) + return NULL_TREE; + } + (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); })))) +#endif + /* Canonicalizations of BIT_FIELD_REFs. 
*/ (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c new file mode 100644 index 00000000000..ff2ffb5e671 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102583.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */ + +typedef int v16si __attribute__((vector_size(64))); +typedef float v8sf __attribute__((vector_size(32))); +typedef float v4sf __attribute__((vector_size(16))); +typedef float v2sf __attribute__((vector_size(8))); + +v8sf part (v16si *srcp) +{ + v16si src = *srcp; + return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8], + (float)src[9], (float)src[10], (float)src[11], (float)src[12] }; +} + +v4sf part1 (v16si *srcp) +{ + v16si src = *srcp; + return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] }; +} + +v2sf part2 (v16si *srcp) +{ + v16si src = *srcp; + return (v2sf) { (float)src[4], (float)src[5] }; +} diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc index 484491fa1c5..2c8d8bc6dce 100644 --- a/gcc/tree-ssa-forwprop.cc +++ b/gcc/tree-ssa-forwprop.cc @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) return true; } +/* Gate for optimize_vector_load. 
*/ +bool +gate_optimize_vector_load (gimple *stmt) +{ + if (!is_gimple_assign (stmt)) + return false; + + tree lhs = gimple_assign_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + return (cfun + && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE + && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode + /* After vector lowering rewrite all loads, but + initially do not since this conflicts with + vector CONSTRUCTOR to shuffle optimization. */ + || (cfun->curr_properties & PROP_gimple_lvec)) + && gimple_assign_load_p (stmt) + && !gimple_has_volatile_ops (stmt) + && !stmt_can_throw_internal (cfun, stmt) + && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs))); +} /* Rewrite the vector load at *GSI to component-wise loads if the load is only used in BIT_FIELD_REF extractions with eventual intermediate @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun) else gsi_next (&gsi); } - else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE - && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode - /* After vector lowering rewrite all loads, but - initially do not since this conflicts with - vector CONSTRUCTOR to shuffle optimization. */ - || (fun->curr_properties & PROP_gimple_lvec)) - && gimple_assign_load_p (stmt) - && !gimple_has_volatile_ops (stmt) - && !stmt_can_throw_internal (cfun, stmt) - && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs))) + else if (gate_optimize_vector_load (stmt)) optimize_vector_load (&gsi); else if (code == COMPLEX_EXPR) -- 2.18.1