Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> This avoids leaving scalar if-converted code around for the case
> of BB vectorizing an if-converted loop body when using the very-cheap
> cost model.  In this case we scan not vectorized scalar stmts in
> the basic-block vectorized for COND_EXPRs and force the vectorization
> to be marked as not profitable.
>
> The patch also makes sure to always consider all BB vectorization
> subgraphs together for costing purposes when vectorizing an
> if-converted loop body.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

LGTM, although you obviously know this code better than I do.

Thanks,
Richard

> Thanks,
> Richard.
>
> 2021-08-24  Richard Biener  <rguent...@suse.de>
>
>       PR tree-optimization/100089
>       * tree-vectorizer.h (vect_slp_bb): Rename to ...
>       (vect_slp_if_converted_bb): ... this and get the original
>       loop as new argument.
>       * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix,
>       pass original loop to vect_slp_if_converted_bb.
>       * tree-vect-slp.c (vect_bb_vectorization_profitable_p):
>       If orig_loop was passed scan the not vectorized stmts
>       for COND_EXPRs and force not profitable if found.
>       (vect_slp_region): Pass down all SLP instances to costing
>       if orig_loop was specified.
>       (vect_slp_bbs): Pass through orig_loop.
>       (vect_slp_bb): Rename to ...
>       (vect_slp_if_converted_bb): ... this and get the original
>       loop as new argument.
>       (vect_slp_function): Adjust.
> ---
>  gcc/tree-vect-slp.c   | 70 ++++++++++++++++++++++++++++++++++---------
>  gcc/tree-vectorizer.c | 20 +++++++------
>  gcc/tree-vectorizer.h |  2 +-
>  3 files changed, 68 insertions(+), 24 deletions(-)
>
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 3ed5bc1989a..8bfa45772d3 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -5287,7 +5287,8 @@ li_cost_vec_cmp (const void *a_, const void *b_)
>  
>  static bool
>  vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
> -                                 vec<slp_instance> slp_instances)
> +                                 vec<slp_instance> slp_instances,
> +                                 loop_p orig_loop)
>  {
>    slp_instance instance;
>    int i;
> @@ -5324,6 +5325,30 @@ vect_bb_vectorization_profitable_p (bb_vec_info 
> bb_vinfo,
>        vector_costs.safe_splice (instance->cost_vec);
>        instance->cost_vec.release ();
>      }
> +  /* When we're vectorizing an if-converted loop body with the
> +     very-cheap cost model make sure we vectorized all if-converted
> +     code.  */
> +  bool force_not_profitable = false;
> +  if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
> +    {
> +      gcc_assert (bb_vinfo->bbs.length () == 1);
> +      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
> +        !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       /* The costing above left us with DCEable vectorized scalar
> +          stmts having the visited flag set.  */
> +       if (gimple_visited_p (gsi_stmt (gsi)))
> +         continue;
> +
> +       if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
> +         if (gimple_assign_rhs_code (ass) == COND_EXPR)
> +           {
> +             force_not_profitable = true;
> +             break;
> +           }
> +     }
> +    }
> +
>    /* Unset visited flag.  */
>    stmt_info_for_cost *cost;
>    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
> @@ -5448,9 +5473,14 @@ vect_bb_vectorization_profitable_p (bb_vec_info 
> bb_vinfo,
>        return false;
>      }
>  
> +  if (dump_enabled_p () && force_not_profitable)
> +    dump_printf_loc (MSG_NOTE, vect_location,
> +                  "not profitable because of unprofitable if-converted "
> +                  "scalar code\n");
> +
>    scalar_costs.release ();
>    vector_costs.release ();
> -  return true;
> +  return !force_not_profitable;
>  }
>  
>  /* qsort comparator for lane defs.  */
> @@ -5895,7 +5925,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int 
> n_stmts, bool &fatal,
>  
>  static bool
>  vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
> -              vec<int> *dataref_groups, unsigned int n_stmts)
> +              vec<int> *dataref_groups, unsigned int n_stmts,
> +              loop_p orig_loop)
>  {
>    bb_vec_info bb_vinfo;
>    auto_vector_modes vector_modes;
> @@ -5944,7 +5975,9 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>             vect_location = instance->location ();
>             if (!unlimited_cost_model (NULL)
>                 && !vect_bb_vectorization_profitable_p
> -                     (bb_vinfo, instance->subgraph_entries))
> +                     (bb_vinfo,
> +                      orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
> +                      : instance->subgraph_entries, orig_loop))
>               {
>                 for (slp_instance inst : instance->subgraph_entries)
>                   if (inst->kind == slp_inst_kind_bb_reduc)
> @@ -5965,7 +5998,9 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>                                "using SLP\n");
>             vectorized = true;
>  
> -           vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
> +           vect_schedule_slp (bb_vinfo,
> +                              orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
> +                              : instance->subgraph_entries);
>  
>             unsigned HOST_WIDE_INT bytes;
>             if (dump_enabled_p ())
> @@ -5980,6 +6015,11 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>                                    "basic block part vectorized using "
>                                    "variable length vectors\n");
>               }
> +
> +           /* When we're called from loop vectorization we're considering
> +              all subgraphs at once.  */
> +           if (orig_loop)
> +             break;
>           }
>       }
>        else
> @@ -6047,7 +6087,7 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>     true if anything in the basic-block was vectorized.  */
>  
>  static bool
> -vect_slp_bbs (const vec<basic_block> &bbs)
> +vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
>  {
>    vec<data_reference_p> datarefs = vNULL;
>    auto_vec<int> dataref_groups;
> @@ -6077,18 +6117,20 @@ vect_slp_bbs (const vec<basic_block> &bbs)
>        ++current_group;
>      }
>  
> -  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
> +  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
>  }
>  
> -/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
> -   true if anything in the basic-block was vectorized.  */
> +/* Special entry for the BB vectorizer.  Analyze and transform a single
> +   if-converted BB with ORIG_LOOPs body being the not if-converted
> +   representation.  Returns true if anything in the basic-block was
> +   vectorized.  */
>  
>  bool
> -vect_slp_bb (basic_block bb)
> +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
>  {
>    auto_vec<basic_block> bbs;
>    bbs.safe_push (bb);
> -  return vect_slp_bbs (bbs);
> +  return vect_slp_bbs (bbs, orig_loop);
>  }
>  
>  /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
> @@ -6139,7 +6181,7 @@ vect_slp_function (function *fun)
>  
>        if (split && !bbs.is_empty ())
>       {
> -       r |= vect_slp_bbs (bbs);
> +       r |= vect_slp_bbs (bbs, NULL);
>         bbs.truncate (0);
>         bbs.quick_push (bb);
>       }
> @@ -6157,13 +6199,13 @@ vect_slp_function (function *fun)
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                              "splitting region at control altering "
>                              "definition %G", last);
> -         r |= vect_slp_bbs (bbs);
> +         r |= vect_slp_bbs (bbs, NULL);
>           bbs.truncate (0);
>         }
>      }
>  
>    if (!bbs.is_empty ())
> -    r |= vect_slp_bbs (bbs);
> +    r |= vect_slp_bbs (bbs, NULL);
>  
>    free (rpo);
>  
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 813f4683d1e..3aa3e2a6783 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -1033,10 +1033,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> 
> *&simduid_to_vf_htab,
>        only non-if-converted parts took part in BB vectorization.  */
>        if (flag_tree_slp_vectorize != 0
>         && loop_vectorized_call
> -       && ! loop->inner
> -       /* This would purely be a workaround and should be removed
> -          once PR100089 is fixed.  */
> -       && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
> +       && ! loop->inner)
>       {
>         basic_block bb = loop->header;
>         bool require_loop_vectorize = false;
> @@ -1062,12 +1059,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> 
> *&simduid_to_vf_htab,
>             gimple_set_uid (stmt, -1);
>             gimple_set_visited (stmt, false);
>           }
> -       if (!require_loop_vectorize && vect_slp_bb (bb))
> +       if (!require_loop_vectorize)
>           {
> -           fold_loop_internal_call (loop_vectorized_call,
> -                                    boolean_true_node);
> -           loop_vectorized_call = NULL;
> -           ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals;
> +           tree arg = gimple_call_arg (loop_vectorized_call, 1);
> +           class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
> +           if (vect_slp_if_converted_bb (bb, scalar_loop))
> +             {
> +               fold_loop_internal_call (loop_vectorized_call,
> +                                        boolean_true_node);
> +               loop_vectorized_call = NULL;
> +               ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals;
> +             }
>           }
>       }
>        /* If outer loop vectorization fails for LOOP_VECTORIZED guarded
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 0ccbaafd50d..6e90217373f 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *);
>  extern void vect_get_slp_defs (slp_tree, vec<tree> *);
>  extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *,
>                              unsigned n = -1U);
> -extern bool vect_slp_bb (basic_block);
> +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop);
>  extern bool vect_slp_function (function *);
>  extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree);
>  extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree);

Reply via email to