On Tue, 8 Sep 2020, Richard Biener wrote:

> 
> This makes the BB vectorizer cost independent SLP subgraphs
> separately.  While on pristine trunk and for x86_64 I failed to
> distill a testcase where the vectorizer would think _any_
> basic-block vectorization opportunity is not profitable I do
> have pending work that would make the cost savings of a
> profitable opportunity make another independently not
> profitable opportunity vectorized.
> 
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> 
> CCing some people to double-check my graph partitioning algorithm.

In fact I can amend gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c
to make it currently profitable to vectorize and after the patch
only vectorize the profitable part.  Consider the patch amended
with this.

This becomes more important once we merge Martins patches to
consider the whole function at once and not stop at basic-block
boundaries for finding SLP opportunities.

Richard.

diff --git 
a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c
index e65a30c06d6..ef74785f6a8 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c
@@ -74,10 +74,28 @@ foo (int* diff)
     d[13] = m[12] - m[13];
     d[14] = m[14] + m[15];
     d[15] = m[15] - m[14];
+    /* The following obviously profitable part should not make
+       the former unprofitable one profitable.  */
+    diff[16 + 16] = diff[16];
+    diff[17 + 16] = diff[17];
+    diff[18 + 16] = diff[18];
+    diff[19 + 16] = diff[19];
+    diff[20 + 16] = diff[20];
+    diff[21 + 16] = diff[21];
+    diff[22 + 16] = diff[22];
+    diff[23 + 16] = diff[23];
+    diff[24 + 16] = diff[24];
+    diff[25 + 16] = diff[25];
+    diff[26 + 16] = diff[26];
+    diff[27 + 16] = diff[27];
+    diff[28 + 16] = diff[28];
+    diff[29 + 16] = diff[29];
+    diff[30 + 16] = diff[30];
+    diff[31 + 16] = diff[31];
     for (k=0; k<16; k++)
       satd += abs(d[k]);
   return satd;
 }

 /* { dg-final { scan-tree-dump "vectorization is not profitable" "slp1" } 
} */
-/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing SLP tree" 1 "slp1" } } 
*/


> Richard.
> 
> 2020-09-08  Richard Biener  <rguent...@suse.de>
> 
>       PR tree-optimization/96043
>       * tree-vectorizer.h (_slp_instance::cost_vec): New.
>       (_slp_instance::subgraph_entries): Likewise.
>       (BB_VINFO_TARGET_COST_DATA): Remove.
>       * tree-vect-slp.c (vect_free_slp_instance): Free
>       cost_vec and subgraph_entries.
>       (vect_analyze_slp_instance): Initialize them.
>       (vect_slp_analyze_operations): Defer passing costs to
>       the target, instead record them in the SLP graph entry.
>       (get_ultimate_leader): New helper for graph partitioning.
>       (vect_bb_partition_graph_r): Likewise.
>       (vect_bb_partition_graph): New function to partition the
>       SLP graph into independently costable parts.
>       (vect_bb_vectorization_profitable_p): Adjust to work on
>       a subgraph.
>       (vect_bb_vectorization_profitable_p): New wrapper,
>       discarding non-profitable vectorization of subgraphs.
>       (vect_slp_analyze_bb_1): Call vect_bb_partition_graph before
>       costing.
> ---
>  gcc/tree-vect-slp.c   | 190 ++++++++++++++++++++++++++++++++++++++----
>  gcc/tree-vectorizer.h |   8 +-
>  2 files changed, 180 insertions(+), 18 deletions(-)
> 
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2b7fd685ef9..35e8985d159 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -126,6 +126,8 @@ vect_free_slp_instance (slp_instance instance, bool 
> final_p)
>  {
>    vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
>    SLP_INSTANCE_LOADS (instance).release ();
> +  instance->subgraph_entries.release ();
> +  instance->cost_vec.release ();
>    free (instance);
>  }
>  
> @@ -2269,6 +2271,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
>         SLP_INSTANCE_LOADS (new_instance) = vNULL;
>         SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : 
> NULL;
>         new_instance->reduc_phis = NULL;
> +       new_instance->cost_vec = vNULL;
> +       new_instance->subgraph_entries = vNULL;
>  
>         vect_gather_slp_loads (new_instance, node);
>         if (dump_enabled_p ())
> @@ -3153,8 +3157,8 @@ vect_slp_analyze_operations (vec_info *vinfo)
>           visited.add (*x);
>         i++;
>  
> -       add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
> -       cost_vec.release ();
> +       /* Remember the SLP graph entry cost for later.  */
> +       instance->cost_vec = cost_vec;
>       }
>      }
>  
> @@ -3162,18 +3166,113 @@ vect_slp_analyze_operations (vec_info *vinfo)
>    if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
>      {
>        hash_set<stmt_vec_info> svisited;
> -      stmt_vector_for_cost cost_vec;
> -      cost_vec.create (2);
>        for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
>       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> -                                  instance, &cost_vec, svisited);
> -      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
> -      cost_vec.release ();
> +                                  instance, &instance->cost_vec, svisited);
>      }
>  
>    return !vinfo->slp_instances.is_empty ();
>  }
>  
> +/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
> +   closing the eventual chain.  */
> +
> +static slp_instance
> +get_ultimate_leader (slp_instance instance,
> +                  hash_map<slp_instance, slp_instance> &instance_leader)
> +{
> +  auto_vec<slp_instance *, 8> chain;
> +  slp_instance *tem;
> +  while (*(tem = instance_leader.get (instance)) != instance)
> +    {
> +      chain.safe_push (tem);
> +      instance = *tem;
> +    }
> +  while (!chain.is_empty ())
> +    *chain.pop () = instance;
> +  return instance;
> +}
> +
> +/* Worker of vect_bb_partition_graph, recurse on NODE.  */
> +
> +static void
> +vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
> +                        slp_instance instance, slp_tree node,
> +                        hash_map<stmt_vec_info, slp_instance> 
> &stmt_to_instance,
> +                        hash_map<slp_instance, slp_instance> 
> &instance_leader)
> +{
> +  stmt_vec_info stmt_info;
> +  unsigned i;
> +  bool all = true;
> +  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
> +    {
> +      bool existed_p;
> +      slp_instance &stmt_instance
> +     = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
> +      if (!existed_p)
> +     {
> +       all = false;
> +     }
> +      else if (stmt_instance != instance)
> +     {
> +       /* If we're running into a previously marked stmt make us the
> +          leader of the current ultimate leader.  This keeps the
> +          leader chain acyclic and works even when the current instance
> +          connects two previously independent graph parts.  */
> +       stmt_instance = get_ultimate_leader (stmt_instance, instance_leader);
> +       if (stmt_instance != instance)
> +         instance_leader.put (stmt_instance, instance);
> +     }
> +      stmt_instance = instance;
> +    }
> +  /* If not all stmts had been visited we have to recurse on children.  */
> +  if (all)
> +    return;
> +
> +  slp_tree child;
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
> +    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
> +      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
> +                              instance_leader);
> +}
> +
> +/* Partition the SLP graph into pieces that can be costed independently.  */
> +
> +static void
> +vect_bb_partition_graph (bb_vec_info bb_vinfo)
> +{
> +  DUMP_VECT_SCOPE ("vect_bb_partition_graph");
> +
> +  /* First walk the SLP graph assigning each involved scalar stmt a
> +     corresponding SLP graph entry and upon visiting a previously
> +     marked stmt, make the stmts leader the current SLP graph entry.  */
> +  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
> +  hash_map<slp_instance, slp_instance> instance_leader;
> +  slp_instance instance;
> +  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
> +    {
> +      instance_leader.put (instance, instance);
> +      vect_bb_partition_graph_r (bb_vinfo,
> +                              instance, SLP_INSTANCE_TREE (instance),
> +                              stmt_to_instance, instance_leader);
> +    }
> +
> +  /* Then collect entries to each independent subgraphs.  */
> +  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
> +    {
> +      slp_instance leader = get_ultimate_leader (instance, instance_leader);
> +      if (leader == instance)
> +     leader->subgraph_entries.safe_push (leader);
> +      else
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "instance %p is leader of %p\n",
> +                          leader, instance);
> +       leader->subgraph_entries.safe_push (instance);
> +     }
> +    }
> +}
>  
>  /* Compute the scalar cost of the SLP node NODE and its children
>     and return it.  Do not account defs that are marked in LIFE and
> @@ -3281,18 +3380,22 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
>      }
>  }
>  
> -/* Check if vectorization of the basic block is profitable.  */
> +/* Check if vectorization of the basic block is profitable for the
> +   subgraph denoted by SLP_INSTANCES.  */
>  
>  static bool
> -vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
> +vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
> +                                 vec<slp_instance> slp_instances)
>  {
> -  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
>    slp_instance instance;
>    int i;
>    unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
>    unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
>  
> -  /* Calculate scalar cost.  */
> +  void *vect_target_cost_data = init_cost (NULL);
> +
> +  /* Calculate scalar cost and sum the cost for the vector stmts
> +     previously collected.  */
>    stmt_vector_for_cost scalar_costs;
>    scalar_costs.create (0);
>    hash_set<slp_tree> visited;
> @@ -3304,22 +3407,25 @@ vect_bb_vectorization_profitable_p (bb_vec_info 
> bb_vinfo)
>        vect_bb_slp_scalar_cost (bb_vinfo,
>                              SLP_INSTANCE_TREE (instance),
>                              &life, &scalar_costs, visited);
> +      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
> +      instance->cost_vec.release ();
>      }
>    /* Unset visited flag.  */
>    stmt_info_for_cost *si;
>    FOR_EACH_VEC_ELT (scalar_costs, i, si)
>      gimple_set_visited  (si->stmt_info->stmt, false);
>  
> -  void *target_cost_data = init_cost (NULL);
> -  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
> +  void *scalar_target_cost_data = init_cost (NULL);
> +  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
>    scalar_costs.release ();
>    unsigned dummy;
> -  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
> -  destroy_cost_data (target_cost_data);
> +  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
> +  destroy_cost_data (scalar_target_cost_data);
>  
> -  /* Complete the target-specific cost calculation.  */
> -  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
> +  /* Complete the target-specific vector cost calculation.  */
> +  finish_cost (vect_target_cost_data, &vec_prologue_cost,
>              &vec_inside_cost, &vec_epilogue_cost);
> +  destroy_cost_data (vect_target_cost_data);
>  
>    vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
>  
> @@ -3344,6 +3450,54 @@ vect_bb_vectorization_profitable_p (bb_vec_info 
> bb_vinfo)
>    return true;
>  }
>  
> +/* For each SLP subgraph determine profitability and remove parts not so.
> +   Returns true if any profitable to vectorize subgraph remains.  */
> +
> +static bool
> +vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
> +{
> +  slp_instance instance;
> +  unsigned i;
> +
> +  auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length 
> ());
> +  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
> +    if (!instance->subgraph_entries.is_empty ())
> +      subgraphs.quick_push (instance);
> +  BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
> +  for (i = 0; i < subgraphs.length ();)
> +    {
> +      instance = subgraphs[i];
> +      if (!vect_bb_vectorization_profitable_p (bb_vinfo,
> +                                            instance->subgraph_entries))
> +     {
> +       /* ???  We need to think of providing better dump/opt-report
> +          locations here.  */
> +       if (dump_enabled_p ())
> +         {
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "not vectorized: vectorization is not "
> +                            "profitable.\n");
> +         }
> +       slp_instance entry;
> +       unsigned j;
> +       FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
> +         if (entry != instance)
> +           vect_free_slp_instance (entry, false);
> +       vect_free_slp_instance (instance, false);
> +       subgraphs.ordered_remove (i);
> +     }
> +      else
> +     {
> +       slp_instance entry;
> +       unsigned j;
> +       FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
> +         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
> +       ++i;
> +     }
> +    }
> +  return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
> +}
> +
>  /* Find any vectorizable constructors and add them to the grouped_store
>     array.  */
>  
> @@ -3721,6 +3875,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int 
> n_stmts, bool &fatal,
>        return false;
>      }
>  
> +  vect_bb_partition_graph (bb_vinfo);
> +
>    /* Cost model: check if the vectorization is worthwhile.  */
>    if (!unlimited_cost_model (NULL)
>        && !vect_bb_vectorization_profitable_p (bb_vinfo))
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e555f465421..41a5cdbb26b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -183,6 +183,13 @@ public:
>  
>    /* The SLP node containing the reduction PHIs.  */
>    slp_tree reduc_phis;
> +
> +  /* Vector cost of this entry to the SLP graph.  */
> +  stmt_vector_for_cost cost_vec;
> +
> +  /* If this instance is the main entry of a subgraph the set of
> +     entries into the same subgraph, including itself.  */
> +  vec<_slp_instance *> subgraph_entries;
>  } *slp_instance;
>  
>  
> @@ -915,7 +922,6 @@ public:
>  #define BB_VINFO_SLP_INSTANCES(B)    (B)->slp_instances
>  #define BB_VINFO_DATAREFS(B)         (B)->shared->datarefs
>  #define BB_VINFO_DDRS(B)             (B)->shared->ddrs
> -#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data
>  
>  static inline bb_vec_info
>  vec_info_for_bb (basic_block bb)
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Reply via email to