We currently fail to re-CSE SLP nodes after optimizing permutes which results in off cost estimates. For gcc.dg/vect/bb-slp-32.c this shows in not re-using the SLP node with the load and arithmetic for both the store and the reduction. The following implements CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
I've tried to make the CSE part of permute materialization but it isn't a very good fit there. I've not bothered to implement something more complete, also handling external defs or defs without SLP_TREE_SCALAR_STMTS. I realize this might result in more BB SLP which in turn might slow down code given costing for BB SLP is difficult (even that we now vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea). This is nevertheless feeding more accurate info to costing which is good. Bootstrapped and tested on x86_64-unknown-linux-gnu. Does this look OK? Thanks, Richard. PR tree-optimization/114413 * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map): New function, split out from ... (vect_analyze_slp): ... here. Call it. (vect_cse_slp_nodes): New function. (vect_optimize_slp): Call it. * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86. --- gcc/testsuite/gcc.dg/vect/bb-slp-32.c | 6 +++ gcc/tree-vect-slp.cc | 77 ++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c index 4f72727b694..475b241c36e 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c @@ -38,3 +38,9 @@ int main() abort (); return 0; } + +/* This is a weak test but we want to re-use the arithmetic for both the + store and the reduction. */ +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { x86_64-*-* i?86-*-* } } } } */ +/* On i386 we vectorize both the store and the reduction. */ +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 2552dacbd69..980d1e7267d 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type candidate) return true; } +typedef hash_map <vec <stmt_vec_info>, slp_tree, + simple_hashmap_traits <bst_traits, slp_tree> > + scalar_stmts_to_slp_tree_map_t; + +/* Release BST_MAP. */ + +static void +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map) +{ + /* The map keeps a reference on SLP nodes built, release that. */ + for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); + it != bst_map->end (); ++it) + if ((*it).second) + vect_free_slp_tree ((*it).second); + delete bst_map; +} + /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree> but then vec::insert does memmove and that's not compatible with std::pair. */ @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo, } } -typedef hash_map <vec <stmt_vec_info>, slp_tree, - simple_hashmap_traits <bst_traits, slp_tree> > - scalar_stmts_to_slp_tree_map_t; - static slp_tree vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, vec<stmt_vec_info> stmts, unsigned int group_size, @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) } } - - - /* The map keeps a reference on SLP nodes built, release that. */ - for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); - it != bst_map->end (); ++it) - if ((*it).second) - vect_free_slp_tree ((*it).second); - delete bst_map; + release_scalar_stmts_to_slp_tree_map (bst_map); if (pattern_found && dump_enabled_p ()) { @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run () free_graph (m_slpg); } +/* Apply CSE to NODE and its children using BST_MAP. */ + +static void +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node) +{ + if (SLP_TREE_DEF_TYPE (node) == vect_internal_def + && SLP_TREE_CODE (node) != VEC_PERM_EXPR + /* Besides some VEC_PERM_EXPR, two-operator nodes also + lack scalar stmts and thus CSE doesn't work via bst_map. Ideally + we'd have sth that works for all internal and external nodes. */ + && !SLP_TREE_SCALAR_STMTS (node).is_empty ()) + { + if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node))) + { + if (*leader != node) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "re-using SLP tree %p for %p\n", + (void *)*leader, (void *)node); + vect_free_slp_tree (node); + (*leader)->refcnt += 1; + node = *leader; + } + return; + } + + bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node); + node->refcnt += 1; + /* And recurse. */ + } + + for (slp_tree &child : SLP_TREE_CHILDREN (node)) + if (child) + vect_cse_slp_nodes (bst_map, child); +} + /* Optimize the SLP graph of VINFO. */ void @@ -6381,6 +6424,16 @@ vect_optimize_slp (vec_info *vinfo) if (vinfo->slp_instances.is_empty ()) return; vect_optimize_slp_pass (vinfo).run (); + + + /* Apply CSE again to nodes after permute optimization. */ + scalar_stmts_to_slp_tree_map_t *bst_map + = new scalar_stmts_to_slp_tree_map_t (); + + for (auto inst : vinfo->slp_instances) + vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst)); + + release_scalar_stmts_to_slp_tree_map (bst_map); } /* Gather loads reachable from the individual SLP graph entries. */ -- 2.35.3