Hi Richi, We decided to take the regression in any code-gen this could give and fix it properly next stage-1. As such here's a new patch based on your previous feedback.
Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-vect-slp.c (vect_analyze_slp_instance): Moved load/store lanes check to ... * tree-vect-loop.c (vect_analyze_loop_2): ..Here gcc/testsuite/ChangeLog: * gcc.dg/vect/slp-11b.c: Update output scan. * gcc.dg/vect/slp-perm-6.c: Likewise. > -----Original Message----- > From: rguent...@c653.arch.suse.de <rguent...@c653.arch.suse.de> On > Behalf Of Richard Biener > Sent: Thursday, October 22, 2020 9:44 AM > To: Tamar Christina <tamar.christ...@arm.com> > Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; o...@ucw.cz > Subject: Re: [PATCH] SLP: Move load/store-lanes check till late > > On Wed, 21 Oct 2020, Tamar Christina wrote: > > > Hi All, > > > > This moves the code that checks for load/store lanes further in the > > pipeline and places it after slp_optimize. This would allow us to > > perform optimizations on the SLP tree and only bail out if we really have a > permute. > > > > With this change it allows us to handle permutes such as {1,1,1,1} > > which should be handled by a load and replicate. > > > > This change however makes it all or nothing. Either all instances can > > be handled or none at all. This is why some of the test cases have been > adjusted. > > So this possibly leaves a loop unvectorized in case there's a ldN/stN > opportunity but another SLP instance with a permutation not handled by > interleaving is present. What I was originally suggesting is to only cancel > the > SLP build if _all_ instances can be handled with ldN/stN. > > Of course I'm also happy with completely removing this heuristics. > > Note some of the comments look off now, also the assignment to ok before > the goto is pointless and you should probably turn this into a dump print > instead. > > Thanks, > Richard. > > > Bootstrapped Regtested on aarch64-none-linux-gnu, -x86_64-pc-linux-gnu > > and no issues. > > > > Ok for master? > > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * tree-vect-slp.c (vect_analyze_slp_instance): Moved load/store > lanes > > check to ... > > * tree-vect-loop.c (vect_analyze_loop_2): ..Here > > > > gcc/testsuite/ChangeLog: > > > > * gcc.dg/vect/slp-11b.c: Update output scan. > > * gcc.dg/vect/slp-perm-6.c: Likewise. > > > > > > -- > Richard Biener <rguent...@suse.de> > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 > Nuernberg, Germany; GF: Felix Imend
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c index 0cc23770badf0e00ef98769a2dd14a92dca32cca..fe5bb0c3ce7682c7cef1313e342d95aba3fe11b2 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-11b.c +++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c @@ -45,4 +45,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided4 && vect_int_mult } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided4 && vect_int_mult } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "re-trying with SLP disabled" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c index 38489291a2659c989121d44c9e9e7bdfaa12f868..07bf8916de7ce88bbb1d65437f8bf6d8ab17efe6 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c @@ -106,7 +106,7 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && {! vect_partial_vectors_usage_1 } } } } } } */ /* The epilogues are vectorized using partial vectors. */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && vect_partial_vectors_usage_1 } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 6fa185daa2836062814f9c9a6659011a3153c6a2..56873b93ef9905ff76929f471de4d32559268304 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2365,6 +2365,78 @@ start_over: "unsupported SLP instances\n"); goto again; } + + /* Check whether any load in ALL SLP instances is possibly permuted. */ + slp_tree load_node, slp_root; + unsigned i, x; + slp_instance instance; + bool can_use_lanes = true; + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) + { + slp_root = SLP_INSTANCE_TREE (instance); + int group_size = SLP_TREE_LANES (slp_root); + tree vectype = SLP_TREE_VECTYPE (slp_root); + bool loads_permuted = false; + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) + { + if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) + continue; + unsigned j; + stmt_vec_info load_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) + if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) + { + loads_permuted = true; + break; + } + } + + /* If the loads and stores can be handled with load/store-lane + instructions record it and move on to the next instance. */ + if (loads_permuted + && vect_store_lanes_supported (vectype, group_size, false)) + { + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) + { + stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_SCALAR_STMTS (load_node)[0]); + /* Use SLP for strided accesses (or if we can't + load-lanes). */ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || ! vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + DR_GROUP_SIZE (stmt_vinfo), false)) + break; + } + + can_use_lanes + = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); + + if (can_use_lanes && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP instance %p can use load/store-lanes\n", + instance); + } + else + { + can_use_lanes = false; + break; + } + } + + /* If all SLP instances can use load/store-lanes abort SLP and try again + with SLP disabled. */ + if (can_use_lanes) + { + ok = opt_result::failure_at (vect_location, + "Built SLP cancelled: can use " + "load/store-lanes\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Built SLP cancelled: all SLP instances support " + "load/store-lanes\n"); + goto again; + } } /* Dissolve SLP-only groups. */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index e97fbe897a76008d50ee94c3b1b009344cc37d4a..76382ac16387ea50b27d0bcce57e664898434498 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -208,7 +208,7 @@ typedef struct _slp_oprnd_info /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each operand. */ -static vec<slp_oprnd_info> +static vec<slp_oprnd_info> vect_create_oprnd_info (int nops, int group_size) { int i; @@ -1096,7 +1096,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, { if (dump_enabled_p ()) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Build SLP failed: different operation " "in stmt %G", stmt); dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -2263,56 +2263,11 @@ vect_build_slp_instance (vec_info *vinfo, "SLP size %u vs. limit %u.\n", tree_size, max_tree_size); - /* Check whether any load is possibly permuted. */ - slp_tree load_node; - bool loads_permuted = false; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) - { - if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) - continue; - unsigned j; - stmt_vec_info load_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) - if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) - { - loads_permuted = true; - break; - } - } - - /* If the loads and stores can be handled with load/store-lane - instructions do not generate this SLP instance. */ - if (is_a <loop_vec_info> (vinfo) - && loads_permuted - && kind == slp_inst_kind_store - && vect_store_lanes_supported - (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false)) - { - slp_tree load_node; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) - { - stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_SCALAR_STMTS (load_node)[0]); - /* Use SLP for strided accesses (or if we can't load-lanes). */ - if (STMT_VINFO_STRIDED_P (stmt_vinfo) - || ! vect_load_lanes_supported - (STMT_VINFO_VECTYPE (stmt_vinfo), - DR_GROUP_SIZE (stmt_vinfo), false)) - break; - } - if (i == SLP_INSTANCE_LOADS (new_instance).length ()) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Built SLP cancelled: can use " - "load/store-lanes\n"); - vect_free_slp_instance (new_instance); - return false; - } - } - - /* Fixup SLP reduction chains. */ - if (kind == slp_inst_kind_reduc_chain) + /* If this is a reduction chain with a conversion in front + amend the SLP tree with a node for that. */ + if (!dr + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) { /* If this is a reduction chain with a conversion in front amend the SLP tree with a node for that. */ @@ -3877,7 +3832,7 @@ vect_bb_partition_graph (bb_vec_info bb_vinfo) and return it. Do not account defs that are marked in LIFE and update LIFE according to uses of NODE. */ -static void +static void vect_bb_slp_scalar_cost (vec_info *vinfo, slp_tree node, vec<bool, va_heap> *life, stmt_vector_for_cost *cost_vec, @@ -3888,7 +3843,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, slp_tree child; if (visited.add (node)) - return; + return; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) { @@ -4143,7 +4098,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, { dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Failed to SLP the basic block.\n"); - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: failed to find SLP opportunities " "in basic block.\n"); } @@ -5008,7 +4963,7 @@ vect_transform_slp_perm_load (vec_info *vinfo, if (!analyze_only) { tree mask_vec = NULL_TREE; - + if (! noop_p) mask_vec = vect_gen_perm_mask_checked (vectype, indices);