This is the second merge proposed from the SLP vectorizer branch.
I have again managed without adding and using --param vect-single-lane-slp
but instead this provides always enabled functionality.

This makes us use SLP reductions (a group of reductions) for the
case where the group size is one.  This basically means we try
to use SLP for all reductions.

I've kept the series close to changes how they are on the branch
but in the end I'll squash it, having separate commits for review
eventually helps identifying common issues we will run into.  In
particular we lack full SLP support for several reduction kinds
and the branch has more enabling patches than in this series.
For example 4/5 makes sure we use shifts and direct opcode
reductions in the reduction epilog for SLP reductions but doesn't
bother to try covering the general case but enables it only
for the single-element group case to avoid regressions
in gcc.dg/vect/reduc-{mul,or}_[12].c testcases.

Bootstrapped and tested on x86_64-unknown-linux-gnu, I've also
successfully built SPEC CPU 2017.  This posting should trigger
arm & riscv pre-checkin CI.

There's one ICE in gcc.target/i386/pr51235.c I discovered late
that I will investigate and address after the weekend.

This change should be more straight-forward than the previous one,
still comments are of course welcome.  After pushed I will followup
with changes to enable single-lane SLP reductions for various
COND_EXPR reductions as well as double-reduction support and
in-order reduction support (also all restricted to single-lane
for the moment).

Thanks,
Richard.

--

The following performs single-lane SLP discovery for reductions.
This exposes a latent issue with reduction SLP in outer loop
vectorization and makes gcc.dg/vect/vect-outer-4[fgkl].c FAIL
execution.

        * tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane
        discoveries are reduction chains and need special backedge
        treatment.
        (vect_analyze_slp): Fall back to single-lane SLP discovery
        for reductions.  Make sure to try single-lane SLP reduction
        for all reductions as fallback.
---
 gcc/tree-vect-slp.cc | 71 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 17 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c7ed520b629..73cc69d85ce 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1907,7 +1907,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
            /* Reduction chain backedge defs are filled manually.
               ???  Need a better way to identify a SLP reduction chain PHI.
               Or a better overall way to SLP match those.  */
-           if (all_same && def_type == vect_reduction_def)
+           if (stmts.length () > 1
+               && all_same && def_type == vect_reduction_def)
              skip_args[loop_latch_edge (loop)->dest_idx] = true;
          }
        else if (def_type != vect_internal_def)
@@ -3905,9 +3906,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
          }
 
       /* Find SLP sequences starting from groups of reductions.  */
-      if (loop_vinfo->reductions.length () > 1)
+      if (loop_vinfo->reductions.length () > 0)
        {
-         /* Collect reduction statements.  */
+         /* Collect reduction statements we can combine into
+            a SLP reduction.  */
          vec<stmt_vec_info> scalar_stmts;
          scalar_stmts.create (loop_vinfo->reductions.length ());
          for (auto next_info : loop_vinfo->reductions)
@@ -3920,25 +3922,60 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
                     reduction path.  In that case we'd have to reverse
                     engineer that conversion stmt following the chain using
                     reduc_idx and from the PHI using reduc_def.  */
-                 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
-                 /* Do not discover SLP reductions for lane-reducing ops, that
-                    will fail later.  */
-                 && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+                 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+               {
+                 /* Do not discover SLP reductions combining lane-reducing
+                    ops, that will fail later.  */
+                 if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
                      || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR
                          && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
-                         && gimple_assign_rhs_code (g) != SAD_EXPR)))
-               scalar_stmts.quick_push (next_info);
+                         && gimple_assign_rhs_code (g) != SAD_EXPR))
+                   scalar_stmts.quick_push (next_info);
+                 else
+                   {
+                     /* Do SLP discovery for single-lane reductions.  */
+                     vec<stmt_vec_info> stmts;
+                     vec<stmt_vec_info> roots = vNULL;
+                     vec<tree> remain = vNULL;
+                     stmts.create (1);
+                     stmts.quick_push (next_info);
+                     vect_build_slp_instance (vinfo,
+                                              slp_inst_kind_reduc_group,
+                                              stmts, roots, remain,
+                                              max_tree_size, &limit,
+                                              bst_map, NULL);
+                   }
+               }
            }
-         if (scalar_stmts.length () > 1)
+         /* Save for re-processing on failure.  */
+         vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+         vec<stmt_vec_info> roots = vNULL;
+         vec<tree> remain = vNULL;
+         if (scalar_stmts.length () <= 1
+             || !vect_build_slp_instance (loop_vinfo,
+                                          slp_inst_kind_reduc_group,
+                                          scalar_stmts, roots, remain,
+                                          max_tree_size, &limit, bst_map,
+                                          NULL))
            {
-             vec<stmt_vec_info> roots = vNULL;
-             vec<tree> remain = vNULL;
-             vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
-                                      scalar_stmts, roots, remain,
-                                      max_tree_size, &limit, bst_map, NULL);
+             if (scalar_stmts.length () <= 1)
+               scalar_stmts.release ();
+             /* Do SLP discovery for single-lane reductions.  */
+             for (auto stmt_info : saved_stmts)
+               {
+                 vec<stmt_vec_info> stmts;
+                 vec<stmt_vec_info> roots = vNULL;
+                 vec<tree> remain = vNULL;
+                 stmts.create (1);
+                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+                 vect_build_slp_instance (vinfo,
+                                          slp_inst_kind_reduc_group,
+                                          stmts, roots, remain,
+                                          max_tree_size, &limit,
+                                          bst_map, NULL);
+               }
+             saved_stmts.release ();
            }
-         else
-           scalar_stmts.release ();
        }
     }
 
-- 
2.35.3

Reply via email to