The following removes, from the main loop vectorization analysis code, the code that cancels SLP when load/store-lanes can be used, and re-implements it as re-discovery of the SLP instance with forced single-lane splits so that the SLP load/store-lanes scheme can be used.
This is now done after SLP discovery and SLP pattern recog are complete to not disturb the latter but per SLP instance instead of being a global decision on the whole loop. This is a behavioral change that for example shows in gcc.dg/vect/slp-perm-6.c on ARM where we formerly used SLP permutes but now a mix of SLP without permutes and load/store lanes. The previous flaky heuristic is now flaky in a different way. * tree-vect-loop.cc (vect_analyze_loop_2): Do not cancel SLP when store-lanes can be used. * tree-vect-slp.cc (vect_analyze_slp): After SLP pattern recog is finished see if there are any SLP instances that would benefit from using load/store-lanes and re-discover those with forced single lanes. * gcc.dg/vect/slp-perm-9.c: Remove expected SLP fail due to three-vector permute. * gcc.dg/vect/slp-perm-6.c: Remove XFAIL. * gcc.dg/vect/slp-perm-1.c: Adjust. * gcc.dg/vect/slp-perm-2.c: Likewise. * gcc.dg/vect/slp-perm-3.c: Likewise. * gcc.dg/vect/slp-perm-4.c: Likewise. * gcc.dg/vect/slp-perm-9.c: Likewise. --- gcc/testsuite/gcc.dg/vect/slp-perm-1.c | 5 +- gcc/testsuite/gcc.dg/vect/slp-perm-2.c | 4 +- gcc/testsuite/gcc.dg/vect/slp-perm-3.c | 4 +- gcc/testsuite/gcc.dg/vect/slp-perm-4.c | 2 +- gcc/testsuite/gcc.dg/vect/slp-perm-6.c | 4 +- gcc/testsuite/gcc.dg/vect/slp-perm-9.c | 2 - gcc/tree-vect-loop.cc | 76 ---------------- gcc/tree-vect-slp.cc | 117 +++++++++++++++++++++++-- 8 files changed, 115 insertions(+), 99 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c index dbb107f95fe..93b59075bce 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c @@ -81,9 +81,8 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && {! 
vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump "can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c index 41fd159adce..6ac29e73122 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c @@ -55,8 +55,6 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! 
vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c index 9ea35ba5afc..d1953054892 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c @@ -68,9 +68,7 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! 
vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c index f4bda39c837..107968f1f7c 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c @@ -115,4 +115,4 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! 
{ vect_load_lanes && vect_strided5 } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c index 5cc6261d69a..000848c587c 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c @@ -106,5 +106,5 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm3_int } } } */ /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */ -/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */ +/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c index 98f1d022226..c9468d81a9d 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c @@ -58,7 +58,5 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */ /* We don't try permutes with a group size of 3 for variable-length vectors. */ -/* { dg-final { scan-tree-dump "permutation requires at least three vectors" "vect" { target { vect_perm_short && { ! 
vect_perm3_short } } xfail vect_variable_length } } } */ -/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_perm3_short || { vect32 || vect_load_lanes } } } } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short || { vect32 || vect_load_lanes } } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 1fb7bbd4d25..242d5e2d916 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2958,82 +2958,6 @@ start_over: "unsupported SLP instances\n"); goto again; } - - /* Check whether any load in ALL SLP instances is possibly permuted. */ - slp_tree load_node, slp_root; - unsigned i, x; - slp_instance instance; - bool can_use_lanes = true; - FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) - { - slp_root = SLP_INSTANCE_TREE (instance); - int group_size = SLP_TREE_LANES (slp_root); - tree vectype = SLP_TREE_VECTYPE (slp_root); - bool loads_permuted = false; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) - { - if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) - continue; - unsigned j; - stmt_vec_info load_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) - if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) - { - loads_permuted = true; - break; - } - } - - /* If the loads and stores can be handled with load/store-lane - instructions record it and move on to the next instance. 
*/ - if (loads_permuted - && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store - && vect_store_lanes_supported (vectype, group_size, false) - != IFN_LAST) - { - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) - if (STMT_VINFO_GROUPED_ACCESS - (SLP_TREE_REPRESENTATIVE (load_node))) - { - stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_REPRESENTATIVE (load_node)); - /* Use SLP for strided accesses (or if we can't - load-lanes). */ - if (STMT_VINFO_STRIDED_P (stmt_vinfo) - || vect_load_lanes_supported - (STMT_VINFO_VECTYPE (stmt_vinfo), - DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) - break; - } - - can_use_lanes - = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); - - if (can_use_lanes && dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "SLP instance %p can use load/store-lanes\n", - (void *) instance); - } - else - { - can_use_lanes = false; - break; - } - } - - /* If all SLP instances can use load/store-lanes abort SLP and try again - with SLP disabled. */ - if (can_use_lanes) - { - ok = opt_result::failure_at (vect_location, - "Built SLP cancelled: can use " - "load/store-lanes\n"); - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Built SLP cancelled: all SLP instances support " - "load/store-lanes\n"); - goto again; - } } /* Dissolve SLP-only groups. */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index fe13f136552..5a65a99d61e 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3484,7 +3484,8 @@ static bool vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map, stmt_vec_info stmt_info, slp_instance_kind kind, - unsigned max_tree_size, unsigned *limit); + unsigned max_tree_size, unsigned *limit, + bool force_single_lane = false); /* Build an interleaving scheme for the store sources RHS_NODES from SCALAR_STMTS. 
*/ @@ -3679,7 +3680,8 @@ vect_build_slp_instance (vec_info *vinfo, unsigned max_tree_size, unsigned *limit, scalar_stmts_to_slp_tree_map_t *bst_map, /* ??? We need stmt_info for group splitting. */ - stmt_vec_info stmt_info_) + stmt_vec_info stmt_info_, + bool force_single_lane = false) { /* If there's no budget left bail out early. */ if (*limit == 0) @@ -3708,9 +3710,17 @@ vect_build_slp_instance (vec_info *vinfo, poly_uint64 max_nunits = 1; unsigned tree_size = 0; unsigned i; - slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, - &max_nunits, matches, limit, - &tree_size, bst_map); + + slp_tree node = NULL; + if (force_single_lane) + { + matches[0] = true; + matches[1] = false; + } + else + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, matches, limit, + &tree_size, bst_map); if (node != NULL) { /* Calculate the unrolling factor based on the smallest type. */ @@ -3925,7 +3935,7 @@ vect_build_slp_instance (vec_info *vinfo, && compare_step_with_zero (vinfo, stmt_info) > 0 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, 1)); - if (want_store_lanes) + if (want_store_lanes || force_single_lane) i = 1; /* A fatal discovery fail doesn't always mean single-lane SLP @@ -3966,7 +3976,7 @@ vect_build_slp_instance (vec_info *vinfo, (max_nunits, end - start)); rhs_nodes.safe_push (node); start = end; - if (want_store_lanes) + if (want_store_lanes || force_single_lane) end = start + 1; else end = group_size; @@ -4094,7 +4104,8 @@ vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map, stmt_vec_info stmt_info, slp_instance_kind kind, - unsigned max_tree_size, unsigned *limit) + unsigned max_tree_size, unsigned *limit, + bool force_single_lane) { vec<stmt_vec_info> scalar_stmts; @@ -4139,7 +4150,7 @@ vect_analyze_slp_instance (vec_info *vinfo, roots, remain, max_tree_size, limit, bst_map, kind == slp_inst_kind_store - ? stmt_info : NULL); + ? stmt_info : NULL, force_single_lane); /* ??? 
If this is slp_inst_kind_store and the above succeeded here's where we should do store group splitting. */ @@ -4670,6 +4681,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) } } + /* Check whether we should force some SLP instances to use load/store-lanes + and do so by forcing SLP re-discovery with single lanes. We used + to cancel SLP when this applied to all instances in a loop but now + we decide this per SLP instance. It's important to do this only + after SLP pattern recognition. */ + if (is_a <loop_vec_info> (vinfo)) + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) + if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store + && !SLP_INSTANCE_TREE (instance)->ldst_lanes) + { + slp_tree slp_root = SLP_INSTANCE_TREE (instance); + int group_size = SLP_TREE_LANES (slp_root); + tree vectype = SLP_TREE_VECTYPE (slp_root); + + auto_vec<slp_tree> loads; + hash_set<slp_tree> visited; + vect_gather_slp_loads (loads, slp_root, visited); + + /* Check whether any load in the SLP instance is possibly + permuted. */ + bool loads_permuted = false; + slp_tree load_node; + unsigned j; + FOR_EACH_VEC_ELT (loads, j, load_node) + { + if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) + continue; + unsigned k; + stmt_vec_info load_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info) + if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k) + { + loads_permuted = true; + break; + } + } + + /* If the loads and stores can use load/store-lanes force re-discovery + with single lanes. */ + if (loads_permuted + && !slp_root->ldst_lanes + && vect_store_lanes_supported (vectype, group_size, false) + != IFN_LAST) + { + bool can_use_lanes = true; + FOR_EACH_VEC_ELT (loads, j, load_node) + if (STMT_VINFO_GROUPED_ACCESS + (SLP_TREE_REPRESENTATIVE (load_node))) + { + stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_REPRESENTATIVE (load_node)); + /* Use SLP for strided accesses (or if we can't + load-lanes). 
*/ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || compare_step_with_zero (vinfo, stmt_vinfo) <= 0 + || vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) + { + can_use_lanes = false; + break; + } + } + + if (can_use_lanes) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP instance %p can use load/store-lanes," + " re-discovering with single-lanes\n", + (void *) instance); + + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root); + + vect_free_slp_instance (instance); + limit = max_tree_size; + bool res = vect_analyze_slp_instance (vinfo, bst_map, + stmt_info, + slp_inst_kind_store, + max_tree_size, &limit, + true); + gcc_assert (res); + auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop (); + LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst; + } + } + } + /* When we end up with load permutations that we cannot possibly handle, like those requiring three vector inputs, lower them using interleaving like schemes. */ -- 2.43.0