On Wed, 19 Jun 2024, Richard Sandiford wrote:

> Richard Biener <rguent...@suse.de> writes:
> > We currently fail to re-CSE SLP nodes after optimizing permutes
> > which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
> > this shows in not re-using the SLP node with the load and arithmetic
> > for both the store and the reduction.  The following implements
> > CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
> >
> > I've tried to make the CSE part of permute materialization but it
> > isn't a very good fit there.  I've not bothered to implement something
> > more complete, also handling external defs or defs without
> > SLP_TREE_SCALAR_STMTS.
> >
> > I realize this might result in more BB SLP which in turn might slow
> > down code given costing for BB SLP is difficult (even that we now
> > vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
> > This is nevertheless feeding more accurate info to costing which is
> > good.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> >
> > Does this look OK?
> >
> > Thanks,
> > Richard.
> >
> >     PR tree-optimization/114413
> >     * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
> >     New function, split out from ...
> >     (vect_analyze_slp): ... here.  Call it.
> >     (vect_cse_slp_nodes): New function.
> >     (vect_optimize_slp): Call it.
> >
> >     * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
> > ---
> >  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
> >  gcc/tree-vect-slp.cc                  | 77 ++++++++++++++++++++++-----
> >  2 files changed, 71 insertions(+), 12 deletions(-)
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
> > b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > index 4f72727b694..475b241c36e 100644
> > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > @@ -38,3 +38,9 @@ int main()
> >      abort ();
> >    return 0;
> >  }
> > +
> > +/* This is a weak test but we want to re-use the arithmetic for both the
> > +   store and the reduction.  */
> > +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
> > x86_64-*-* i?86-*-* } } } } */
> > +/* On i386 we vectorize both the store and the reduction.  */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 
> > "slp2" { target { x86_64-*-* i?86-*-* } } } } */
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index 2552dacbd69..980d1e7267d 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
> > candidate)
> >    return true;
> >  }
> >  
> > +typedef hash_map <vec <stmt_vec_info>, slp_tree,
> > +             simple_hashmap_traits <bst_traits, slp_tree> >
> > +  scalar_stmts_to_slp_tree_map_t;
> > +
> > +/* Release BST_MAP.  */
> > +
> > +static void
> > +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
> > *bst_map)
> > +{
> > +  /* The map keeps a reference on SLP nodes built, release that.  */
> > +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> > +       it != bst_map->end (); ++it)
> > +    if ((*it).second)
> > +      vect_free_slp_tree ((*it).second);
> > +  delete bst_map;
> > +}
> > +
> >  /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
> >     but then vec::insert does memmove and that's not compatible with
> >     std::pair.  */
> > @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
> >      }
> >  }
> >  
> > -typedef hash_map <vec <stmt_vec_info>, slp_tree,
> > -             simple_hashmap_traits <bst_traits, slp_tree> >
> > -  scalar_stmts_to_slp_tree_map_t;
> > -
> >  static slp_tree
> >  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
> >                    vec<stmt_vec_info> stmts, unsigned int group_size,
> > @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> > max_tree_size)
> >     }
> >      }
> >  
> > -
> > -
> > -  /* The map keeps a reference on SLP nodes built, release that.  */
> > -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> > -       it != bst_map->end (); ++it)
> > -    if ((*it).second)
> > -      vect_free_slp_tree ((*it).second);
> > -  delete bst_map;
> > +  release_scalar_stmts_to_slp_tree_map (bst_map);
> >  
> >    if (pattern_found && dump_enabled_p ())
> >      {
> > @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
> >    free_graph (m_slpg);
> >  }
> >  
> > +/* Apply CSE to NODE and its children using BST_MAP.  */
> > +
> > +static void
> > +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& 
> > node)
> > +{
> > +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> > +      && SLP_TREE_CODE (node) != VEC_PERM_EXPR
> > +      /* Besides some VEC_PERM_EXPR, two-operator nodes also
> > +    lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
> > +    we'd have sth that works for all internal and external nodes.  */
> > +      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
> > +    {
> > +      if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
> > +   {
> > +     if (*leader != node)
> > +       {
> > +         if (dump_enabled_p ())
> > +           dump_printf_loc (MSG_NOTE, vect_location,
> > +                            "re-using SLP tree %p for %p\n",
> > +                            (void *)*leader, (void *)node);
> > +         vect_free_slp_tree (node);
> > +         (*leader)->refcnt += 1;
> > +         node = *leader;
> > +       }
> > +     return;
> > +   }
> > +
> > +      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
> > +      node->refcnt += 1;
> 
> Since we're not starting with zeroed refcnts, won't this end up double
> counting the references?

This is for the reference bst_map holds, we're releasing that when
destroying bst_map.  I copied this from other bst-map populating
code (and release_scalar_stmts_to_slp_tree_map performs the
corresponding decrement).  I guess the way we operate here guarantees
refcnt stays at at least one but to re-use
release_scalar_stmts_to_slp_tree_map I have to increment.

> > +      /* And recurse.  */
> > +    }
> > +
> > +  for (slp_tree &child : SLP_TREE_CHILDREN (node))
> > +    if (child)
> > +      vect_cse_slp_nodes (bst_map, child);
> > +}
> > +
> >  /* Optimize the SLP graph of VINFO.  */
> >  
> >  void
> > @@ -6381,6 +6424,16 @@ vect_optimize_slp (vec_info *vinfo)
> >    if (vinfo->slp_instances.is_empty ())
> >      return;
> >    vect_optimize_slp_pass (vinfo).run ();
> > +
> > +
> 
> Super minor nit, but: one newline too many :)

Fixed ;)

I'll push tomorrow if the pre-checkin CI is happy.

Richard.

> LGTM otherwise FWIW.
> 
> Thanks,
> Richard
> 
> > +  /* Apply CSE again to nodes after permute optimization.  */
> > +  scalar_stmts_to_slp_tree_map_t *bst_map
> > +    = new scalar_stmts_to_slp_tree_map_t ();
> > +
> > +  for (auto inst : vinfo->slp_instances)
> > +    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
> > +
> > +  release_scalar_stmts_to_slp_tree_map (bst_map);
> >  }
> >  
> >  /* Gather loads reachable from the individual SLP graph entries.  */
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to