Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > This avoids leaving scalar if-converted code around for the case > of BB vectorizing an if-converted loop body when using the very-cheap > cost model. In this case we scan not vectorized scalar stmts in > the basic-block vectorized for COND_EXPRs and force the vectorization > to be marked as not profitable. > > The patch also makes sure to always consider all BB vectorization > subgraphs together for costing purposes when vectorizing an > if-converted loop body. > > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
LGTM, although you obviously know this code better than I do. Thanks, Richard > Thanks, > Richard. > > 2021-08-24 Richard Biener <rguent...@suse.de> > > PR tree-optimization/100089 > * tree-vectorizer.h (vect_slp_bb): Rename to ... > (vect_slp_if_converted_bb): ... this and get the original > loop as new argument. > * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix, > pass original loop to vect_slp_if_converted_bb. > * tree-vect-slp.c (vect_bb_vectorization_profitable_p): > If orig_loop was passed scan the not vectorized stmts > for COND_EXPRs and force not profitable if found. > (vect_slp_region): Pass down all SLP instances to costing > if orig_loop was specified. > (vect_slp_bbs): Pass through orig_loop. > (vect_slp_bb): Rename to ... > (vect_slp_if_converted_bb): ... this and get the original > loop as new argument. > (vect_slp_function): Adjust. > --- > gcc/tree-vect-slp.c | 70 ++++++++++++++++++++++++++++++++++--------- > gcc/tree-vectorizer.c | 20 +++++++------ > gcc/tree-vectorizer.h | 2 +- > 3 files changed, 68 insertions(+), 24 deletions(-) > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index 3ed5bc1989a..8bfa45772d3 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -5287,7 +5287,8 @@ li_cost_vec_cmp (const void *a_, const void *b_) > > static bool > vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, > - vec<slp_instance> slp_instances) > + vec<slp_instance> slp_instances, > + loop_p orig_loop) > { > slp_instance instance; > int i; > @@ -5324,6 +5325,30 @@ vect_bb_vectorization_profitable_p (bb_vec_info > bb_vinfo, > vector_costs.safe_splice (instance->cost_vec); > instance->cost_vec.release (); > } > + /* When we're vectorizing an if-converted loop body with the > + very-cheap cost model make sure we vectorized all if-converted > + code. */ > + bool force_not_profitable = false; > + if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP) > + { > + gcc_assert (bb_vinfo->bbs.length () == 1); > + for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); > + !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + /* The costing above left us with DCEable vectorized scalar > + stmts having the visited flag set. */ > + if (gimple_visited_p (gsi_stmt (gsi))) > + continue; > + > + if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi))) > + if (gimple_assign_rhs_code (ass) == COND_EXPR) > + { > + force_not_profitable = true; > + break; > + } > + } > + } > + > /* Unset visited flag. */ > stmt_info_for_cost *cost; > FOR_EACH_VEC_ELT (scalar_costs, i, cost) > @@ -5448,9 +5473,14 @@ vect_bb_vectorization_profitable_p (bb_vec_info > bb_vinfo, > return false; > } > > + if (dump_enabled_p () && force_not_profitable) > + dump_printf_loc (MSG_NOTE, vect_location, > + "not profitable because of unprofitable if-converted " > + "scalar code\n"); > + > scalar_costs.release (); > vector_costs.release (); > - return true; > + return !force_not_profitable; > } > > /* qsort comparator for lane defs. */ > @@ -5895,7 +5925,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int > n_stmts, bool &fatal, > > static bool > vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > - vec<int> *dataref_groups, unsigned int n_stmts) > + vec<int> *dataref_groups, unsigned int n_stmts, > + loop_p orig_loop) > { > bb_vec_info bb_vinfo; > auto_vector_modes vector_modes; > @@ -5944,7 +5975,9 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > vect_location = instance->location (); > if (!unlimited_cost_model (NULL) > && !vect_bb_vectorization_profitable_p > - (bb_vinfo, instance->subgraph_entries)) > + (bb_vinfo, > + orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo) > + : instance->subgraph_entries, orig_loop)) > { > for (slp_instance inst : instance->subgraph_entries) > if (inst->kind == slp_inst_kind_bb_reduc) > @@ -5965,7 +5998,9 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > "using SLP\n"); > vectorized = true; > > - vect_schedule_slp (bb_vinfo, instance->subgraph_entries); > + vect_schedule_slp (bb_vinfo, > + orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo) > + : instance->subgraph_entries); > > unsigned HOST_WIDE_INT bytes; > if (dump_enabled_p ()) > @@ -5980,6 +6015,11 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > "basic block part vectorized using " > "variable length vectors\n"); > } > + > + /* When we're called from loop vectorization we're considering > + all subgraphs at once. */ > + if (orig_loop) > + break; > } > } > else > @@ -6047,7 +6087,7 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > true if anything in the basic-block was vectorized. */ > > static bool > -vect_slp_bbs (const vec<basic_block> &bbs) > +vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) > { > vec<data_reference_p> datarefs = vNULL; > auto_vec<int> dataref_groups; > @@ -6077,18 +6117,20 @@ vect_slp_bbs (const vec<basic_block> &bbs) > ++current_group; > } > > - return vect_slp_region (bbs, datarefs, &dataref_groups, insns); > + return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop); > } > > -/* Main entry for the BB vectorizer. Analyze and transform BB, returns > - true if anything in the basic-block was vectorized. */ > +/* Special entry for the BB vectorizer. Analyze and transform a single > + if-converted BB with ORIG_LOOPs body being the not if-converted > + representation. Returns true if anything in the basic-block was > + vectorized. */ > > bool > -vect_slp_bb (basic_block bb) > +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) > { > auto_vec<basic_block> bbs; > bbs.safe_push (bb); > - return vect_slp_bbs (bbs); > + return vect_slp_bbs (bbs, orig_loop); > } > > /* Main entry for the BB vectorizer. Analyze and transform BB, returns > @@ -6139,7 +6181,7 @@ vect_slp_function (function *fun) > > if (split && !bbs.is_empty ()) > { > - r |= vect_slp_bbs (bbs); > + r |= vect_slp_bbs (bbs, NULL); > bbs.truncate (0); > bbs.quick_push (bb); > } > @@ -6157,13 +6199,13 @@ vect_slp_function (function *fun) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "splitting region at control altering " > "definition %G", last); > - r |= vect_slp_bbs (bbs); > + r |= vect_slp_bbs (bbs, NULL); > bbs.truncate (0); > } > } > > if (!bbs.is_empty ()) > - r |= vect_slp_bbs (bbs); > + r |= vect_slp_bbs (bbs, NULL); > > free (rpo); > > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c > index 813f4683d1e..3aa3e2a6783 100644 > --- a/gcc/tree-vectorizer.c > +++ b/gcc/tree-vectorizer.c > @@ -1033,10 +1033,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> > *&simduid_to_vf_htab, > only non-if-converted parts took part in BB vectorization. */ > if (flag_tree_slp_vectorize != 0 > && loop_vectorized_call > - && ! loop->inner > - /* This would purely be a workaround and should be removed > - once PR100089 is fixed. */ > - && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP) > + && ! loop->inner) > { > basic_block bb = loop->header; > bool require_loop_vectorize = false; > @@ -1062,12 +1059,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> > *&simduid_to_vf_htab, > gimple_set_uid (stmt, -1); > gimple_set_visited (stmt, false); > } > - if (!require_loop_vectorize && vect_slp_bb (bb)) > + if (!require_loop_vectorize) > { > - fold_loop_internal_call (loop_vectorized_call, > - boolean_true_node); > - loop_vectorized_call = NULL; > - ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; > + tree arg = gimple_call_arg (loop_vectorized_call, 1); > + class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg)); > + if (vect_slp_if_converted_bb (bb, scalar_loop)) > + { > + fold_loop_internal_call (loop_vectorized_call, > + boolean_true_node); > + loop_vectorized_call = NULL; > + ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; > + } > } > } > /* If outer loop vectorization fails for LOOP_VECTORIZED guarded > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 0ccbaafd50d..6e90217373f 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *); > extern void vect_get_slp_defs (slp_tree, vec<tree> *); > extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *, > unsigned n = -1U); > -extern bool vect_slp_bb (basic_block); > +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); > extern bool vect_slp_function (function *); > extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); > extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree);