Hi All,

This patch adds support for capping VF at runtime for VLA loops with a
data dependency.

Previously, no loop with a data dependency could be vectorized with VLA as we
made no assumption on the upper bound of vector length. This adds basic support
for this case (only for partial vectors with a full mask though).

The bump_vector_ptr logic is incorrect right now.  I don't understand it
sufficiently yet.

Thoughts on this initial direction?

gcc/ChangeLog:

        * tree-vect-data-refs.cc (bump_vector_ptr): Add early return to stop
        ICE (incorrect logic though).
        * tree-vect-loop-manip.cc (vect_set_loop_invariant_group_controls): New
        function.
        (vect_set_loop_condition_normal): Set rgroup controls in capped case.
        (vect_set_loop_condition): Add logic to create capped VF ssa node.
        (vect_gen_vector_loop_niters): Add logic to peel in case of capped VF.
        (vect_do_peeling): Ditto.
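        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize
        runtime_vf_cap.
        * tree-vect-stmts.cc (vect_get_data_ptr_increment): Use the runtime VF
        to compute the step in the capped case.
        * tree-vectorizer.h (GCC_TREE_VECTORIZER_H): Include is-a.h.
        (LOOP_VINFO_RUNTIME_VF_CAP): New macro.
        (use_capped_vf): New function.
        (vect_vf_for_cost): Take the runtime VF cap into account.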

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve_cap_1.c: New test.
        * gcc.target/aarch64/sve_cap_1_run.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/sve_cap_1.c  | 54 +++++++++++++
 .../gcc.target/aarch64/sve_cap_1_run.c        | 38 ++++++++++
 gcc/tree-vect-data-refs.cc                    |  2 +-
 gcc/tree-vect-loop-manip.cc                   | 76 ++++++++++++++++++-
 gcc/tree-vect-loop.cc                         | 11 +++
 gcc/tree-vect-stmts.cc                        | 12 +++
 gcc/tree-vectorizer.h                         | 25 +++++-
 7 files changed, 213 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c

diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
new file mode 100644
index 00000000000..31d97cd7619
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-inline -march=armv8-a+sve -fno-vect-cost-model 
-mautovec-preference=sve-only" } */
+
+#define LOOP(TYPE)                             \
+  void                                         \
+  f_##TYPE##_5 (TYPE *a, int n)                        \
+  {                                            \
+    for (long i = 0; i < n; ++i)                       \
+      a[i] += a[i - 5];                                \
+  } \
+  void                                         \
+  f_##TYPE##_10 (TYPE *a, int n)                       \
+  {                                            \
+    for (long i = 0; i < n; ++i)                       \
+      a[i] += a[i - 10];                               \
+  }
+
+LOOP (char)
+LOOP (short)
+LOOP (int)
+LOOP (float)
+LOOP (long)
+LOOP (double)
+
+// CHAR
+// { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b, vl5} 1 } }
+// { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+\.b, xzr, x[0-9]+} 1 } 
}
+// { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]+, 
x[0-9]+\]} 4 } }
+// { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, 
z[0-9]+\.b} 2 } }
+// { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]+, 
x[0-9]+\]} 2 } }
+
+// SHORT
+// { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h, vl5} 1 } }
+// { dg-final { scan-assembler-times {\tcnth\tx[0-9]+} 1 } }
+// { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+\.h, xzr, x[0-9]+} 1 } 
}
+// { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, 
x[0-9]+\, lsl 1]} 4 } }
+// { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, 
z[0-9]+\.h} 2 } }
+// { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x[0-9]+, 
x[0-9]+\, lsl 1]} 2 } }
+
+// INT/FLOAT
+// { dg-final { scan-assembler-times {\tcntw\tx[0-9]+} 4 } }
+// { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+\.s, xzr, x[0-9]+} 4 } 
}
+// { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, 
x[0-9]+\, lsl 2]} 8 } }
+// { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s} 2 } }
+// { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s} 2 } }
+// { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, 
x[0-9]+\, lsl 2]} 4 } }
+
+// LONG/DOUBLE
+// { dg-final { scan-assembler-times {\tcntd\tx[0-9]+} 4 } }
+// { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+\.d, xzr, x[0-9]+} 4 } 
}
+// { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, 
x[0-9]+\, lsl 3]} 8 } }
+// { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, z[0-9]+\.d, 
z[0-9]+\.d} 2 } }
+// { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d, z[0-9]+\.d} 2 } }
+// { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, 
x[0-9]+\, lsl 3]} 4 } }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
new file mode 100644
index 00000000000..2072cfe61ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve 
-fno-vect-cost-model" } */
+
+#include "sve_cap_1.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+
+#define TEST_LOOP(TYPE, M)                     \
+  {                                            \
+    TYPE a[N + M];                             \
+    for (int i = 0; i < N + M; ++i)            \
+      a[i] = F (i);                            \
+    f_##TYPE##_##M (a + M, N);                 \
+    for (int i = 0; i < N; ++i)                        \
+      {                                                \
+       TYPE x = a[i];                          \
+       TYPE y = F (i + M);                     \
+       if (a[i + M] != (TYPE) (x + y))         \
+         __builtin_abort ();                   \
+      }                                                \
+  }
+
+#define TEST_LOOPS(TYPE) \
+  TEST_LOOP (TYPE, 5) \
+  TEST_LOOP (TYPE, 10)
+
+int
+main (void)
+{
+  TEST_LOOPS (char);
+  TEST_LOOPS (short);
+  TEST_LOOPS (int);
+  TEST_LOOPS (long);
+  TEST_LOOPS (float);
+  TEST_LOOPS (double);
+}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 824b5f0f769..a1d198f37be 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -5858,7 +5858,7 @@ bump_vector_ptr (vec_info *vinfo,
       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
     }
 
-  if (!ptr_incr)
+  if (!ptr_incr || use_capped_vf (vinfo))
     return new_dataref_ptr;
 
   /* Update the vector-pointer's cross-iteration increment.  */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 2d01a4b0ed1..99603f92034 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
+#include "tree-core.h"
 #include "tree.h"
 #include "gimple.h"
 #include "cfghooks.h"
@@ -1221,12 +1222,48 @@ vect_set_loop_condition_partial_vectors_avx512 (class 
loop *loop,
   return cond_stmt;
 }
 
+/* Helper for vect_set_loop_condition_normal.  For the capped-VF case, append
+   to PREHEADER_SEQ a definition of every control in RGC, based on STEP.  */
+
+static void
+vect_set_loop_invariant_group_controls (gimple_seq *preheader_seq,
+                                       rgroup_controls *rgc, tree step)
+{
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+
+  tree nitems_step = step;
+
+  tree step_type = TREE_TYPE (step);
+
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+        these multiplications don't overflow.  */
+      tree step_factor = build_int_cst (step_type, nitems_per_iter);
+      nitems_step = gimple_build (preheader_seq, MULT_EXPR, step_type,
+                                 step, step_factor);
+    }
+
+  /* Define every control as vect_gen_while (0, NITEMS_STEP).  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      tree zero_val = build_zero_cst (step_type);
+      tree new_ctrl = vect_gen_while (preheader_seq, ctrl_type, zero_val,
+                                     nitems_step, "loop_mask");
+
+      gassign *assign = gimple_build_assign (ctrl, new_ctrl);
+      gimple_seq_add_stmt (preheader_seq, assign);
+    }
+}
 
 /* Like vect_set_loop_condition, but handle the case in which the vector
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
                                class loop *loop, tree niters, tree step,
                                tree final_iv, bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
@@ -1345,6 +1382,21 @@ vect_set_loop_condition_normal (loop_vec_info /* 
loop_vinfo */, edge exit_edge,
     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);
 
+  if (use_capped_vf(loop_vinfo))
+    {
+      gimple_seq preheader_seq = NULL;
+
+      unsigned int i;
+      rgroup_controls *rgc;
+      auto_vec<rgroup_controls> *controls
+       = &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec;
+
+      FOR_EACH_VEC_ELT (*controls, i, rgc)
+       if (!rgc->controls.is_empty ())
+         vect_set_loop_invariant_group_controls (&preheader_seq, rgc, step);
+      add_preheader_seq (loop, preheader_seq);
+    }
+
   if (final_iv)
     {
       gassign *assign;
@@ -1398,7 +1450,9 @@ vect_set_loop_condition (class loop *loop, edge loop_e, 
loop_vec_info loop_vinfo
   gcond *orig_cond = get_loop_exit_condition (loop_e);
   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 
-  if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+  if (loop_vinfo
+      && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && !use_capped_vf (loop_vinfo))
     {
       if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == 
vect_partial_vectors_avx512)
        cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, 
loop_e,
@@ -2844,7 +2898,20 @@ vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, 
tree niters,
   else
     {
       niters_vector = ni_minus_gap;
-      step_vector = build_int_cst (type, vf);
+      // If this is a capped loop, build the step_vector
+      if (use_capped_vf (loop_vinfo))
+       {
+         step_vector = fold_build2
+           (MIN_EXPR, type,
+            build_int_cst_type (type, LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo)),
+            build_int_cst_type (type, vf));
+         gimple_seq stmts = NULL;
+         step_vector = force_gimple_operand (step_vector, &stmts, true, 
NULL_TREE);
+         loop_vinfo ->runtime_vf = step_vector;
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+      else
+       step_vector = build_int_cst (type, vf);
     }
 
   if (!is_gimple_val (niters_vector))
@@ -3161,6 +3228,9 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
     bound_epilog += vf - 1;
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && use_capped_vf(loop_vinfo))
+    bound_epilog += LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo);
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     bound_epilog += 1;
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index cd90df2f0c0..99d97ac0530 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -995,6 +995,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     th (0),
     versioning_threshold (0),
     vectorization_factor (0),
+    runtime_vf_cap (MAX_VECTORIZATION_FACTOR),
     main_loop_edge (nullptr),
     skip_main_loop_edge (nullptr),
     skip_this_loop_edge (nullptr),
@@ -2559,7 +2560,17 @@ start_over:
                   LOOP_VINFO_INT_NITERS (loop_vinfo));
     }
 
+  /* Need to check if we can cap vectorization here? */
   if (max_vf != MAX_VECTORIZATION_FACTOR
+      && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && max_vf < LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo))
+    {
+      LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo) = max_vf;
+    }
+
+  if (max_vf != MAX_VECTORIZATION_FACTOR
+      && max_vf < LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo)
       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
     return opt_result::failure_at (vect_location, "bad data dependence.\n");
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 26a0850a19d..be54ee582ee 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "backend.h"
 #include "target.h"
 #include "rtl.h"
+#include "tree-core.h"
 #include "tree.h"
 #include "gimple.h"
 #include "ssa.h"
@@ -3240,6 +3241,17 @@ vect_get_data_ptr_increment (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
                                                     memory_access_type);
 
   tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+  if (loop_vinfo && use_capped_vf (loop_vinfo))
+    {
+      tree type = TREE_TYPE (loop_vinfo->runtime_vf);
+      iv_step
+       = fold_build2 (MULT_EXPR, type,
+                      fold_build1 (NOP_EXPR, type,
+                                   TYPE_SIZE_UNIT (TREE_TYPE (aggr_type))),
+                      loop_vinfo->runtime_vf);
+      return iv_step;
+    }
+
   tree step = vect_dr_behavior (vinfo, dr_info)->step;
   if (tree_int_cst_sgn (step) == -1)
     iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f4e17840061..f29b5672aec 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3.  If not see
 #ifndef GCC_TREE_VECTORIZER_H
 #define GCC_TREE_VECTORIZER_H
 
+#include "is-a.h"
 typedef class _stmt_vec_info *stmt_vec_info;
 typedef struct _slp_tree *slp_tree;
 
@@ -779,6 +780,12 @@ public:
   /* Unrolling factor  */
   poly_uint64 vectorization_factor;
 
+  /* Runtime SSA node for VF factor  */
+  tree runtime_vf;
+
+  /* Runtime VF cap, for VLA modes only; MAX_VECTORIZATION_FACTOR if none.  */
+  unsigned int runtime_vf_cap;
+
   /* If this loop is an epilogue loop whose main loop can be skipped,
      MAIN_LOOP_EDGE is the edge from the main loop to this loop's
      preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
@@ -1092,6 +1099,7 @@ public:
 #define LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT(L) (L)->allow_mutual_alignment
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
+#define LOOP_VINFO_RUNTIME_VF_CAP(L)      (L)->runtime_vf_cap
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_LENS(L)                 (L)->lens
@@ -2210,6 +2218,19 @@ vect_use_loop_mask_for_alignment_p (loop_vec_info 
loop_vinfo)
               && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
 }
 
+/* Return true if VINFO is a loop whose VF is capped at run time: a cap has
+   been set and the maximum VF may be less than the vectorization factor.  */
+
+static inline bool
+use_capped_vf (vec_info *vinfo)
+{
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+  return (loop_vinfo
+         && LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo) != MAX_VECTORIZATION_FACTOR
+         && maybe_lt (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo),
+                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
+}
+
 /* Return the number of vectors of type VECTYPE that are needed to get
    NUNITS elements.  NUNITS should be based on the vectorization factor,
    so it is always a known multiple of the number of elements in VECTYPE.  */
@@ -2287,7 +2308,9 @@ vect_update_max_nunits (poly_uint64 *max_nunits, tree 
vectype)
 inline unsigned int
 vect_vf_for_cost (loop_vec_info loop_vinfo)
 {
-  return estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  return std::min ((unsigned int) estimated_poly_value
+                    (LOOP_VINFO_VECT_FACTOR (loop_vinfo)),
+                  LOOP_VINFO_RUNTIME_VF_CAP (loop_vinfo));
 }
 
 /* Estimate the number of elements in VEC_TYPE for costing purposes.
-- 
2.34.1

Reply via email to