This re-implements the scrapping of SLP, for the case where
load/store-lanes can be used, as re-discovery of the SLP instance
with forced single-lane splits so that the SLP load/store-lanes
scheme can be used.

This is now done after SLP discovery and SLP pattern recognition are
complete, so as not to disturb the latter, and it is now a decision
made per SLP instance instead of a global decision on the whole loop.

        * tree-vect-slp.cc (vect_analyze_slp): After SLP pattern
        recog is finished see if there are any SLP instances
        that would benefit from using load/store-lanes and
        re-discover those with forced single lanes.
---
 gcc/tree-vect-slp.cc | 117 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 108 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bfecbc87f29..852937cdc34 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3481,7 +3481,8 @@ static bool
 vect_analyze_slp_instance (vec_info *vinfo,
                           scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info, slp_instance_kind kind,
-                          unsigned max_tree_size, unsigned *limit);
+                          unsigned max_tree_size, unsigned *limit,
+                          bool force_single_lane = false);
 
 /* Build an interleaving scheme for the store sources RHS_NODES from
    SCALAR_STMTS.  */
@@ -3676,7 +3677,8 @@ vect_build_slp_instance (vec_info *vinfo,
                         unsigned max_tree_size, unsigned *limit,
                         scalar_stmts_to_slp_tree_map_t *bst_map,
                         /* ???  We need stmt_info for group splitting.  */
-                        stmt_vec_info stmt_info_)
+                        stmt_vec_info stmt_info_,
+                        bool force_single_lane = false)
 {
   /* If there's no budget left bail out early.  */
   if (*limit == 0)
@@ -3705,9 +3707,17 @@ vect_build_slp_instance (vec_info *vinfo,
   poly_uint64 max_nunits = 1;
   unsigned tree_size = 0;
   unsigned i;
-  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
-                                      &max_nunits, matches, limit,
-                                      &tree_size, bst_map);
+
+  slp_tree node = NULL;
+  if (force_single_lane)
+    {
+      matches[0] = true;
+      matches[1] = false;
+    }
+  else
+    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+                               &max_nunits, matches, limit,
+                               &tree_size, bst_map);
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -3922,7 +3932,7 @@ vect_build_slp_instance (vec_info *vinfo,
               && compare_step_with_zero (vinfo, stmt_info) > 0
               && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
                                                 group_size, 1));
-         if (want_store_lanes)
+         if (want_store_lanes || force_single_lane)
            i = 1;
 
          if (dump_enabled_p ())
@@ -3958,7 +3968,7 @@ vect_build_slp_instance (vec_info *vinfo,
                                               (max_nunits, end - start));
                  rhs_nodes.safe_push (node);
                  start = end;
-                 if (want_store_lanes)
+                 if (want_store_lanes || force_single_lane)
                    end = start + 1;
                  else
                    end = group_size;
@@ -4086,7 +4096,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
                           scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info,
                           slp_instance_kind kind,
-                          unsigned max_tree_size, unsigned *limit)
+                          unsigned max_tree_size, unsigned *limit,
+                          bool force_single_lane)
 {
   vec<stmt_vec_info> scalar_stmts;
 
@@ -4131,7 +4142,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
                                      roots, remain,
                                      max_tree_size, limit, bst_map,
                                      kind == slp_inst_kind_store
-                                     ? stmt_info : NULL);
+                                     ? stmt_info : NULL, force_single_lane);
 
   /* ???  If this is slp_inst_kind_store and the above succeeded here's
      where we should do store group splitting.  */
@@ -4662,6 +4673,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
        }
     }
 
+  /* Check whether we should force some SLP instances to use load/store-lanes
+     and do so by forcing SLP re-discovery with single lanes.  We used
+     to cancel SLP when this applied to all instances in a loop but now
+     we decide this per SLP instance.  It's important to do this only
+     after SLP pattern recognition.  */
+  if (is_a <loop_vec_info> (vinfo))
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+         && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+       {
+         slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+         auto_vec<slp_tree> loads;
+         hash_set<slp_tree> visited;
+         vect_gather_slp_loads (loads, slp_root, visited);
+
+         /* Check whether any load in the SLP instance is possibly
+            permuted.  */
+         bool loads_permuted = false;
+         slp_tree load_node;
+         unsigned j;
+         FOR_EACH_VEC_ELT (loads, j, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned k;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can use load/store-lanes force re-discovery
+            with single lanes.  */
+         if (loads_permuted
+             && !slp_root->ldst_lanes
+             && vect_store_lanes_supported (vectype, group_size, false)
+             != IFN_LAST)
+           {
+             bool can_use_lanes = true;
+             FOR_EACH_VEC_ELT (loads, j, load_node)
+               if (STMT_VINFO_GROUPED_ACCESS
+                     (SLP_TREE_REPRESENTATIVE (load_node)))
+                 {
+                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                       (SLP_TREE_REPRESENTATIVE (load_node));
+                   /* Use SLP for strided accesses (or if we can't
+                      load-lanes).  */
+                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                       || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+                       || vect_load_lanes_supported
+                            (STMT_VINFO_VECTYPE (stmt_vinfo),
+                             DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+                     {
+                       can_use_lanes = false;
+                       break;
+                     }
+                 }
+
+             if (can_use_lanes)
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "SLP instance %p can use load/store-lanes,"
+                                    " re-discovering with single-lanes\n",
+                                    (void *) instance);
+
+                 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+                 vect_free_slp_instance (instance);
+                 limit = max_tree_size;
+                 bool res = vect_analyze_slp_instance (vinfo, bst_map,
+                                                       stmt_info,
+                                                       slp_inst_kind_store,
+                                                       max_tree_size, &limit,
+                                                       true);
+                 gcc_assert (res);
+                 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+                 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+               }
+           }
+       }
+
   /* When we end up with load permutations that we cannot possibly handle,
      like those requiring three vector inputs, lower them using interleaving
      like schemes.  */
-- 
2.43.0

Reply via email to