This is the second merge proposed from the SLP vectorizer branch. I have again managed to avoid adding and using --param vect-single-lane-slp; instead this provides always-enabled functionality.
This makes us use SLP reductions (a group of reductions) for the case where the group size is one. This basically means we try to use SLP for all reductions. I've kept the series close to how the changes are on the branch, but in the end I'll squash it; having separate commits for review helps identify common issues we will run into. In particular we lack full SLP support for several reduction kinds, and the branch has more enabling patches than in this series. For example 4/5 makes sure we use shifts and direct opcode reductions in the reduction epilog for SLP reductions, but doesn't try to cover the general case, enabling it only for the single-element group case to avoid regressions in the gcc.dg/vect/reduc-{mul,or}_[12].c testcases. Bootstrapped and tested on x86_64-unknown-linux-gnu; I've also successfully built SPEC CPU 2017. This posting should trigger arm & riscv pre-checkin CI. There's one ICE in gcc.target/i386/pr51235.c I discovered late that I will investigate and address after the weekend. This change should be more straightforward than the previous one; still, comments are of course welcome. After it is pushed I will follow up with changes to enable single-lane SLP reductions for various COND_EXPR reductions as well as double-reduction support and in-order reduction support (also all restricted to single-lane for the moment). Thanks, Richard. -- The following performs single-lane SLP discovery for reductions. This exposes a latent issue with reduction SLP in outer loop vectorization and makes gcc.dg/vect/vect-outer-4[fgkl].c FAIL execution. * tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane discoveries are reduction chains and need special backedge treatment. (vect_analyze_slp): Fall back to single-lane SLP discovery for reductions. Make sure to try single-lane SLP reduction for all reductions as fallback. 
--- gcc/tree-vect-slp.cc | 71 +++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index c7ed520b629..73cc69d85ce 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1907,7 +1907,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, /* Reduction chain backedge defs are filled manually. ??? Need a better way to identify a SLP reduction chain PHI. Or a better overall way to SLP match those. */ - if (all_same && def_type == vect_reduction_def) + if (stmts.length () > 1 + && all_same && def_type == vect_reduction_def) skip_args[loop_latch_edge (loop)->dest_idx] = true; } else if (def_type != vect_internal_def) @@ -3905,9 +3906,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) } /* Find SLP sequences starting from groups of reductions. */ - if (loop_vinfo->reductions.length () > 1) + if (loop_vinfo->reductions.length () > 0) { - /* Collect reduction statements. */ + /* Collect reduction statements we can combine into + a SLP reduction. */ vec<stmt_vec_info> scalar_stmts; scalar_stmts.create (loop_vinfo->reductions.length ()); for (auto next_info : loop_vinfo->reductions) @@ -3920,25 +3922,60 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) reduction path. In that case we'd have to reverse engineer that conversion stmt following the chain using reduc_idx and from the PHI using reduc_def. */ - && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def - /* Do not discover SLP reductions for lane-reducing ops, that - will fail later. */ - && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) + && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) + { + /* Do not discover SLP reductions combining lane-reducing + ops, that will fail later. 
*/ + if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR - && gimple_assign_rhs_code (g) != SAD_EXPR))) - scalar_stmts.quick_push (next_info); + && gimple_assign_rhs_code (g) != SAD_EXPR)) + scalar_stmts.quick_push (next_info); + else + { + /* Do SLP discovery for single-lane reductions. */ + vec<stmt_vec_info> stmts; + vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; + stmts.create (1); + stmts.quick_push (next_info); + vect_build_slp_instance (vinfo, + slp_inst_kind_reduc_group, + stmts, roots, remain, + max_tree_size, &limit, + bst_map, NULL); + } + } } - if (scalar_stmts.length () > 1) + /* Save for re-processing on failure. */ + vec<stmt_vec_info> saved_stmts = scalar_stmts.copy (); + vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; + if (scalar_stmts.length () <= 1 + || !vect_build_slp_instance (loop_vinfo, + slp_inst_kind_reduc_group, + scalar_stmts, roots, remain, + max_tree_size, &limit, bst_map, + NULL)) { - vec<stmt_vec_info> roots = vNULL; - vec<tree> remain = vNULL; - vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group, - scalar_stmts, roots, remain, - max_tree_size, &limit, bst_map, NULL); + if (scalar_stmts.length () <= 1) + scalar_stmts.release (); + /* Do SLP discovery for single-lane reductions. */ + for (auto stmt_info : saved_stmts) + { + vec<stmt_vec_info> stmts; + vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; + stmts.create (1); + stmts.quick_push (vect_stmt_to_vectorize (stmt_info)); + vect_build_slp_instance (vinfo, + slp_inst_kind_reduc_group, + stmts, roots, remain, + max_tree_size, &limit, + bst_map, NULL); + } + saved_stmts.release (); } - else - scalar_stmts.release (); } } -- 2.35.3