Hi Richard, > -----Original Message----- > From: Richard Biener [mailto:richard.guent...@gmail.com] > Sent: Tuesday, May 17, 2016 5:40 PM > To: Kumar, Venkataramanan <venkataramanan.ku...@amd.com> > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [Patch V2] Fix SLP PR58135. > > On Tue, May 17, 2016 at 1:56 PM, Kumar, Venkataramanan > <venkataramanan.ku...@amd.com> wrote: > > Hi Richard, > > > > I created the patch by passing -b option to git. Now the patch is more > readable. > > > > As per your suggestion I tried to fix the PR by splitting the SLP store > > group at > vector boundary after the SLP tree is built. > > > > Boot strap PASSED on x86_64. > > Checked the patch with check_GNU_style.sh. > > > > The gfortran.dg/pr46519-1.f test now does SLP vectorization. Hence it > generated 2 more vzeroupper. > > As recommended I adjusted the test case by adding -fno-tree-slp-vectorize > to make it as expected after loop vectorization. > > > > The following tests are now passing. > > > > ------ Snip----- > > Tests that now work, but didn't before: > > > > gcc.dg/vect/bb-slp-19.c -flto -ffat-lto-objects scan-tree-dump-times > > slp2 "basic block vectorized" 1 > > > > gcc.dg/vect/bb-slp-19.c scan-tree-dump-times slp2 "basic block > > vectorized" 1 > > > > New tests that PASS: > > > > gcc.dg/vect/pr58135.c (test for excess errors) gcc.dg/vect/pr58135.c > > -flto -ffat-lto-objects (test for excess errors) > > > > ------ Snip----- > > > > ChangeLog > > > > 2016-05-14 Venkataramanan Kumar > <venkataramanan.ku...@amd.com> > > PR tree-optimization/58135 > > * tree-vect-slp.c: When group size is not multiple of vector size, > > allow splitting of store group at vector boundary. > > > > Test suite ChangeLog > > 2016-05-14 Venkataramanan Kumar > <venkataramanan.ku...@amd.com> > > * gcc.dg/vect/bb-slp-19.c: Remove XFAIL. > > * gcc.dg/vect/pr58135.c: Add new. > > * gfortran.dg/pr46519-1.f: Adjust test case. > > > > The attached patch Ok for trunk? > > > Please avoid the excessive vertical space around the vect_build_slp_tree call. Yes fixed in the attached patch. > > + /* Calculate the unrolling factor. */ > + unrolling_factor = least_common_multiple > + (nunits, group_size) / group_size; > ... > + else > { > /* Calculate the unrolling factor based on the smallest type. */ > if (max_nunits > nunits) > - unrolling_factor = least_common_multiple (max_nunits, group_size) > - / group_size; > + unrolling_factor > + = least_common_multiple (max_nunits, > + group_size)/group_size; > > please compute the "correct" unroll factor immediately and move the > "unrolling of BB required" error into the if() case by post-poning the nunits > < > group_size check (and use max_nunits here). > Yes fixed in the attached patch.
> + if (is_a <bb_vec_info> (vinfo) > + && nunits < group_size > + && unrolling_factor != 1 > + && is_a <bb_vec_info> (vinfo)) > + { > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Build SLP failed: store group " > + "size not a multiple of the vector size " > + "in basic block SLP\n"); > + /* Fatal mismatch. */ > + matches[nunits] = false; > > this is too pessimistic - you want to add the extra 'false' at group_size / > max_nunits * max_nunits. Yes fixed in attached patch. > > It looks like you leak 'node' in the if () path as well. You need > > vect_free_slp_tree (node); > loads.release (); > > thus treat it as a failure case. Yes fixed. I added an else part before scalar_stmts.release call for the case when SLP tree is not built. This avoids double freeing. Bootstrapped and reg tested on X86_64. Ok for trunk ? > > Thanks, > Richard. > > > Regards, > > Venkat. > > Regards, Venkat.
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-19.c b/gcc/testsuite/gcc.dg/vect/bb-slp-19.c index 42cd294..c282155 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-19.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-19.c @@ -53,5 +53,5 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr58135.c b/gcc/testsuite/gcc.dg/vect/pr58135.c new file mode 100644 index 0000000..ca25000 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr58135.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int a[100]; +void foo () +{ + a[0] = a[1] = a[2] = a[3] = a[4]= 0; +} + +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gfortran.dg/pr46519-1.f b/gcc/testsuite/gfortran.dg/pr46519-1.f index 51c64b8..46be9f5 100644 --- a/gcc/testsuite/gfortran.dg/pr46519-1.f +++ b/gcc/testsuite/gfortran.dg/pr46519-1.f @@ -1,5 +1,5 @@ ! { dg-do compile { target i?86-*-* x86_64-*-* } } -! { dg-options "-O3 -mavx -mvzeroupper -mtune=generic -dp" } +! { dg-options "-O3 -mavx -mvzeroupper -fno-tree-slp-vectorize -mtune=generic -dp" } PROGRAM MG3XDEMO INTEGER LM, NM, NV, NR, NIT diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index d713848..3babfcd 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1754,18 +1754,6 @@ vect_analyze_slp_instance (vec_info *vinfo, } nunits = TYPE_VECTOR_SUBPARTS (vectype); - /* Calculate the unrolling factor. */ - unrolling_factor = least_common_multiple (nunits, group_size) / group_size; - if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unrolling required in basic" - " block SLP\n"); - - return false; - } - /* Create a node (a root of the SLP tree) for the packed grouped stores. */ scalar_stmts.create (group_size); next = stmt; @@ -1801,26 +1789,44 @@ vect_analyze_slp_instance (vec_info *vinfo, /* Build the tree for the SLP instance. */ bool *matches = XALLOCAVEC (bool, group_size); unsigned npermutes = 0; - if ((node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, &max_nunits, &loads, matches, &npermutes, - NULL, max_tree_size)) != NULL) + NULL, max_tree_size); + if (node != NULL) { - /* Calculate the unrolling factor based on the smallest type. */ - if (max_nunits > nunits) - unrolling_factor = least_common_multiple (max_nunits, group_size) - / group_size; + /*Calculate the unrolling factor based on the smallest type. */ + unrolling_factor + = least_common_multiple (max_nunits, group_size)/group_size; - if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo)) + if (max_nunits > nunits + && unrolling_factor != 1 + && is_a <bb_vec_info> (vinfo)) { if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unrolling required in basic" - " block SLP\n"); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "Build SLP failed: unrolling " + "required in basic block SLP\n"); vect_free_slp_tree (node); loads.release (); return false; } + if (is_a <bb_vec_info> (vinfo) + && max_nunits < group_size + && unrolling_factor != 1) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: store group " + "size not a multiple of the vector size " + "in basic block SLP\n"); + /* Fatal mismatch. */ + matches[group_size/max_nunits * max_nunits] = false; + vect_free_slp_tree (node); + loads.release (); + } + else + { /* Create a new SLP instance. */ new_instance = XNEW (struct _slp_instance); SLP_INSTANCE_TREE (new_instance) = node; @@ -1842,8 +1848,8 @@ vect_analyze_slp_instance (vec_info *vinfo, (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) { - int load_place - = vect_get_place_in_interleaving_chain (load, first_stmt); + int load_place = vect_get_place_in_interleaving_chain + (load, first_stmt); gcc_assert (load_place != -1); if (load_place != j) this_load_permuted = true; @@ -1873,7 +1879,8 @@ vect_analyze_slp_instance (vec_info *vinfo, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Build SLP failed: unsupported load " "permutation "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, + TDF_SLIM, stmt, 0); dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); } vect_free_slp_instance (new_instance); @@ -1881,7 +1888,7 @@ vect_analyze_slp_instance (vec_info *vinfo, } } - /* If the loads and stores can be handled with load/store-lane + /* If the loads and stores can be handled with load/store-lan instructions do not generate this SLP instance. */ if (is_a <loop_vec_info> (vinfo) && loads_permuted @@ -1893,7 +1900,8 @@ vect_analyze_slp_instance (vec_info *vinfo, gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt); - /* Use SLP for strided accesses (or if we can't load-lanes). */ + /* Use SLP for strided accesses (or if we + can't load-lanes). */ if (STMT_VINFO_STRIDED_P (stmt_vinfo) || ! vect_load_lanes_supported (STMT_VINFO_VECTYPE (stmt_vinfo), @@ -1922,11 +1930,14 @@ vect_analyze_slp_instance (vec_info *vinfo, return true; } - + } + else + { /* Failed to SLP. */ /* Free the allocated memory. */ scalar_stmts.release (); loads.release (); + } /* For basic block SLP, try to break the group up into multiples of the vector size. */