From: Juzhe-Zhong <juzhe.zh...@rivai.ai>

This patch implement decrement IV for length approach in loop control.

Address comment from kewen that incorporate the implementation inside
"vect_set_loop_controls_directly" instead of a standalone function.

Address comment from Richard using MIN_EXPR to handle these 3 following
cases
1. single rgroup.
2. multiple rgroup for SLP.
3. multiple rgroup for non-SLP (tested on vec_pack_trunc).

Bootstraped && Regression on x86.

Ok for trunk ?

gcc/ChangeLog:

        * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
        (vect_set_loop_controls_directly): Add decrement IV support.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new 
variable.
        (vect_get_loop_len): Add decrement IV support.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
        (vect_get_loop_len): Add decrement IV support.

---
 gcc/tree-vect-loop-manip.cc | 177 +++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-loop.cc       |  38 +++++++-
 gcc/tree-vect-stmts.cc      |   9 +-
 gcc/tree-vectorizer.h       |  13 ++-
 4 files changed, 224 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..1baac7b1b52 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,58 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, 
rgroup_controls *dest_rgm,
   return false;
 }
 
+/* Try to use adjust loop lens for non-SLP multiple-rgroups.
+
+     _36 = MIN_EXPR <ivtmp_34, VF>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+     Second length:
+       tmp = _36 - loop_len_15;
+       loop_len_16 = MIN (tmp, VF/N);
+
+     Third length:
+       tmp2 = tmp - loop_len_16;
+       loop_len_17 = MIN (tmp2, VF/N);
+
+     Forth length:
+       tmp3 = tmp2 - loop_len_17;
+       loop_len_18 = MIN (tmp3, VF/N);  */
+
+static void
+vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls 
*dest_rgm,
+                      rgroup_controls *src_rgm)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
+      tree dest = dest_rgm->controls[i];
+      tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+      gassign *stmt;
+      if (i == 0)
+       {
+         /* MIN (X, VF*I/N) capped to the range [0, VF/N].  */
+         stmt = gimple_build_assign (dest, MIN_EXPR, src, length_limit);
+         gimple_seq_add_stmt (seq, stmt);
+       }
+      else
+       {
+         /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
+         tree temp = make_ssa_name (iv_type);
+         stmt = gimple_build_assign (temp, MINUS_EXPR, src,
+                                     dest_rgm->controls[i - 1]);
+         gimple_seq_add_stmt (seq, stmt);
+         stmt = gimple_build_assign (dest, MIN_EXPR, temp, length_limit);
+         gimple_seq_add_stmt (seq, stmt);
+       }
+    }
+}
+
 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
    for all the rgroup controls in RGC and return a control that is nonzero
    when the loop needs to iterate.  Add any new preheader statements to
@@ -468,9 +520,10 @@ vect_set_loop_controls_directly (class loop *loop, 
loop_vec_info loop_vinfo,
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
-            loop, &incr_gsi, insert_after, &index_before_incr,
-            &index_after_incr);
+  if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
+              loop, &incr_gsi, insert_after, &index_before_incr,
+              &index_after_incr);
 
   tree zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
@@ -552,8 +605,13 @@ vect_set_loop_controls_directly (class loop *loop, 
loop_vec_info loop_vinfo,
   /* Convert the IV value to the comparison type (either a no-op or
      a demotion).  */
   gimple_seq test_seq = NULL;
-  test_index = gimple_convert (&test_seq, compare_type, test_index);
-  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    test_limit = gimple_convert (preheader_seq, iv_type, nitems_total);
+  else
+    {
+      test_index = gimple_convert (&test_seq, compare_type, test_index);
+      gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+    }
 
   /* Provide a definition of each control in the group.  */
   tree next_ctrl = NULL_TREE;
@@ -587,6 +645,101 @@ vect_set_loop_controls_directly (class loop *loop, 
loop_vec_info loop_vinfo,
                                          bias_tree);
        }
 
+      if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+       {
+         /* Create decrement IV:
+
+            Case 1 (single rgroup):
+               ...
+               _10 = (unsigned long) count_12(D);
+               ...
+               # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+               _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+               ...
+               vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+               ...
+               ivtmp_35 = ivtmp_9 - _36;
+               ...
+               if (ivtmp_35 != 0)
+                 goto <bb 4>; [83.33%]
+               else
+                 goto <bb 5>; [16.67%]
+
+            Case 2 (SLP multiple rgroup):
+               ...
+               _38 = (unsigned long) n_12(D);
+               _39 = _38 * 2;
+               _40 = MAX_EXPR <_39, 16>;
+               _41 = _40 + 18446744073709551600;
+               ...
+               # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
+               # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
+               ...
+               _44 = MIN_EXPR <ivtmp_42, 32>;
+               _47 = MIN_EXPR <ivtmp_45, 32>;
+               ...
+               .LEN_STORE (_6, 8B, _47, ...);
+               ...
+               .LEN_STORE (_25, 8B, _44, ...);
+               _33 = _47 / 2;
+               ...
+               .LEN_STORE (_8, 16B, _33, ...);
+               _36 = _44 / 2;
+               ...
+               .LEN_STORE (_15, 16B, _36, ...);
+               ivtmp_46 = ivtmp_45 - _47;
+               ivtmp_43 = ivtmp_42 - _44;
+               ...
+               if (ivtmp_46 != 0)
+                 goto <bb 4>; [83.33%]
+               else
+                 goto <bb 5>; [16.67%]
+
+            Case 3 (non-SLP multiple rgroup):
+               ...
+               _38 = (unsigned long) n_12(D);
+               ...
+               # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+               ...
+               _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+               loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+               _41 = _40 - loop_len_21;
+               loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+               _42 = _40 - loop_len_20;
+               loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+               _43 = _40 - loop_len_19;
+               loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+               ...
+               vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+               ...
+               vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+               ...
+               vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+               ...
+               vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+               vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
+               vect__7.13_32 = VEC_PACK_TRUNC_EXPR <vect__4.10_28, 
vect__4.11_30>;
+               vect__7.12_33 = VEC_PACK_TRUNC_EXPR <vect__7.13_31, 
vect__7.13_32>;
+               ...
+               .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+               ivtmp_39 = ivtmp_38 - _40;
+               ...
+               if (ivtmp_39 != 0)
+                 goto <bb 3>; [92.31%]
+               else
+                 goto <bb 4>; [7.69%]
+         */
+         create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop,
+                    &incr_gsi, insert_after, &index_before_incr,
+                    &index_after_incr);
+         tree step = gimple_build (header_seq, MIN_EXPR, iv_type,
+                                   index_before_incr, nitems_step);
+         gassign *assign = gimple_build_assign (ctrl, step);
+         gimple_seq_add_stmt (header_seq, assign);
+         next_ctrl = index_after_incr;
+         continue;
+       }
+
       /* Create the initial control.  First include all items that
         are within the loop limit.  */
       tree init_ctrl = NULL_TREE;
@@ -704,6 +857,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
 
@@ -753,6 +907,19 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
              continue;
          }
 
+       if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+           && rgc->max_nscalars_per_iter == 1
+           && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+         {
+           rgroup_controls *sub_rgc
+             = &(*controls)[nmasks / rgc->controls.length () - 1];
+           if (!sub_rgc->controls.is_empty ())
+             {
+               vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
+               continue;
+             }
+         }
+
        /* See whether zero-based IV would ever generate all-false masks
           or zero length before wrapping around.  */
        bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ed0166fedab..44c0829a823 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     vectorizable (false),
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
+    using_decrementing_iv_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2725,6 +2726,17 @@ start_over:
       && !vect_verify_loop_lens (loop_vinfo))
     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
+  /* If we're vectorizing an loop that uses length "controls" and
+     can iterate more than once, we apply decrementing IV approach
+     in loop control.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+          && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+          && LOOP_VINFO_INT_NITERS (loop_vinfo)
+               <= LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
+    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
@@ -10364,12 +10376,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, 
vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
 
 tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-                  unsigned int nvectors, unsigned int index)
+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+                  vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+                  unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   bool use_bias_adjusted_len =
     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10400,6 +10414,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
vec_loop_lens *lens,
 
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+       {
+         /* A loop len for data type X can be reused for data type Y
+            if X has N times more elements than Y and if Y's elements
+            are N times bigger than X's.  */
+         gcc_assert (multiple_p (nunits1, nunits2));
+         unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+         gimple_seq seq = NULL;
+         loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+                                  build_int_cst (iv_type, factor));
+         if (seq)
+           gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+       }
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7313191b0db..b5e4bc59355 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8795,8 +8795,9 @@ vectorizable_store (vec_info *vinfo,
              else if (loop_lens)
                {
                  tree final_len
-                   = vect_get_loop_len (loop_vinfo, loop_lens,
-                                        vec_num * ncopies, vec_num * j + i);
+                   = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                        vec_num * ncopies, vectype,
+                                        vec_num * j + i);
                  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
                  machine_mode vmode = TYPE_MODE (vectype);
                  opt_machine_mode new_ovmode
@@ -10151,8 +10152,8 @@ vectorizable_load (vec_info *vinfo,
                    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
                      {
                        tree final_len
-                         = vect_get_loop_len (loop_vinfo, loop_lens,
-                                              vec_num * ncopies,
+                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                              vec_num * ncopies, vectype,
                                               vec_num * j + i);
                        tree ptr = build_int_cst (ref_type,
                                                  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..8af3b35324e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@ public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use a decrementing loop control IV that counts
+     scalars. This can be done for any loop that:
+
+       (a) uses length "controls"; and
+       (b) can iterate more than once.  */
+  bool using_decrementing_iv_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@ public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             
\
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
@@ -2293,8 +2301,9 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, 
vec_loop_masks *,
                                unsigned int, tree, unsigned int);
 extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
                                  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-                              unsigned int);
+extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
+                              vec_loop_lens *, unsigned int, tree,
+                              unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
-- 
2.36.1

Reply via email to