Hi Richard,

Thanks for the review.

On Tue, 28 May 2019 at 20:44, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> writes:
> > [...]
> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > index b3fae5b..c15b8a2 100644
> > --- a/gcc/tree-vect-loop-manip.c
> > +++ b/gcc/tree-vect-loop-manip.c
> > @@ -415,10 +415,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
> >                             bool might_wrap_p)
> >  {
> >    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
> > +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
> >    tree mask_type = rgm->mask_type;
> >    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
> >    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
> > +  bool convert = false;
> >
> > +  /* If the compare_type is not iv_type, we will create an IV with
> > +     iv_type with truncated use (i.e. converted to the correct type).  */
> > +  if (compare_type != iv_type)
> > +    convert = true;
> >    /* Calculate the maximum number of scalar values that the rgroup
> >       handles in total, the number that it handles for each iteration
> >       of the vector loop, and the number that it should skip during the
> > @@ -444,12 +450,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
> >       processed.  */
> >    tree index_before_incr, index_after_incr;
> >    gimple_stmt_iterator incr_gsi;
> > +  gimple_stmt_iterator incr_gsi2;
> >    bool insert_after;
> > -  tree zero_index = build_int_cst (compare_type, 0);
> > +  tree zero_index;
> >    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> > -          insert_after, &index_before_incr, &index_after_incr);
> >
> > +  if (convert)
> > +    {
> > +      /* If we are creating IV of iv_type and then converting.  */
> > +      zero_index = build_int_cst (iv_type, 0);
> > +      tree step = build_int_cst (iv_type,
> > +                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> > +      /* Creating IV of iv_type.  */
> > +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
> > +              insert_after, &index_before_incr, &index_after_incr);
> > +      /* Create truncated index_before and after increament.  */
> > +      tree index_before_incr_trunc = make_ssa_name (compare_type);
> > +      tree index_after_incr_trunc = make_ssa_name (compare_type);
> > +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,
> > +                                                   NOP_EXPR,
> > +                                                   index_before_incr);
> > +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,
> > +                                                  NOP_EXPR,
> > +                                                  index_after_incr);
> > +      incr_gsi2 = incr_gsi;
> > +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);
> > +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);
> > +      index_before_incr = index_before_incr_trunc;
> > +      index_after_incr = index_after_incr_trunc;
> > +      zero_index = build_int_cst (compare_type, 0);
> > +    }
> > +  else
> > +    {
> > +      /* If the IV is of compare_type, no convertion needed.  */
> > +      zero_index = build_int_cst (compare_type, 0);
> > +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> > +              insert_after, &index_before_incr, &index_after_incr);
> > +    }
> >    tree test_index, test_limit, first_limit;
> >    gimple_stmt_iterator *test_gsi;
> >    if (might_wrap_p)
>
> Now that we have an explicit iv_type, there shouldn't be any need to
> treat this as two special cases.  I think we should just convert the
> IV to the comparison type before passing it to the WHILE.

Changed it.
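
The IV is now always created in iv_type and only narrowed to
compare_type right before the WHILE_ULT call, roughly along these
lines (a sketch; the attached patch has the actual hunk and inserts
the statements slightly differently):

      tree test_index_cmp_type = make_ssa_name (compare_type);
      gimple *conv_stmt = gimple_build_assign (test_index_cmp_type,
                                               NOP_EXPR, test_index);
      gsi_insert_before (test_gsi, conv_stmt, GSI_SAME_STMT);
      gcall *call = vect_gen_while (next_mask, test_index_cmp_type,
                                    this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);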
>
> > @@ -617,6 +654,41 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
> >    return next_mask;
> >  }
> >
> > +/* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.
> > +   If it is not possible to calcilate iv_limit, return -1.  */
>
> Maybe:
>
> /* Decide whether it is possible to use a zero-based induction variable
>    when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
>    return the value that the induction variable must be able to hold
>    in order to ensure that the loop ends with an all-false mask.
>    Return -1 otherwise.  */
>
> I think the function should go on in tree-vect-loop.c instead.

OK.
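
It now lives in tree-vect-loop.c, with the prototype exported from
tree-vectorizer.h:

  extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);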
>
> > +widest_int
> > +vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo)
>
> Maybe: vect_iv_limit_for_full_masking
>
> Probably worth dropping the "loop" parameter and getting it from
> LOOP_VINFO.
OK.

>
> > +
> > +  /* Convert skip_niters to the right type.  */
>
> Comment no longer applies.
>
> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> > +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
> > +
> > +  /* Now calculate the value that the induction variable must be able
> > +     to hit in order to ensure that we end the loop with an all-false mask.
> > +     This involves adding the maximum number of inactive trailing scalar
> > +     iterations.  */
> > +  widest_int iv_limit = -1;
> > +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
> > +  if (known_max_iters)
>
> No need for this temporary variable.
>
> > +    {
> > +      if (niters_skip)
> > +     {
> > +       /* Add the maximum number of skipped iterations to the
> > +          maximum iteration count.  */
> > +       if (TREE_CODE (niters_skip) == INTEGER_CST)
> > +         iv_limit += wi::to_widest (niters_skip);
> > +       else
> > +         iv_limit += max_vf - 1;
> > +     }
>
> Note that MASK_SKIP_NITERS isn't set at the point you call it
> for vect_set_loop_condition_masked.  I think we should have:
>
>     else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
>       /* Make a conservatively-correct assumption.  */
>       iv_limit += max_vf - 1;
OK.
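
If I am reading the rounding step quoted below correctly: with a
vectorization factor of 4 (so max_vf = 4) and a maximum latch count
of 10, iv_limit becomes (10 & -4) + 4 = 12, and the IV type then has
to be wide enough for 12 * max_nscalars_per_iter.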

>
> > +      /* IV_LIMIT is the maximum number of latch iterations, which is also
> > +      the maximum in-range IV value.  Round this value down to the previous
> > +      vector alignment boundary and then add an extra full iteration.  */
> > +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
> > +    }
> > +  return iv_limit;
> > +}
> > +
> >  /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
> >     LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
> >     number of iterations of the original scalar loop that should be
> > [...]
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index e1229a5..431025b 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -1056,6 +1056,16 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
> >    /* Find a scalar mode for which WHILE_ULT is supported.  */
> >    opt_scalar_int_mode cmp_mode_iter;
> >    tree cmp_type = NULL_TREE;
> > +  tree iv_type = NULL_TREE;
> > +  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);
> > +  widest_int iv_precision = -1;
>
> iv_precision should be unsigned int.  Setting it to UINT_MAX would
> simplify the later code.
OK.

>
> > +
> > +  if (iv_limit != -1)
> > +    iv_precision
> > +      = wi::min_precision (iv_limit
> > +                        * vect_get_max_nscalars_per_iter (loop_vinfo),
> > +                        UNSIGNED);
> > +
>
> Would be good to avoid the duplicated call to
> vect_get_max_nscalars_per_iter (also called for min_ni_width).
OK.
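
Both users now share one value computed near the top of
vect_verify_full_masking, roughly (a sketch; the attached patch has
the actual hunk):

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);
  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
                                      UNSIGNED);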

>
> >    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
> >      {
> >        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
> > @@ -1066,13 +1076,25 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
> >         if (this_type
> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))
> >           {
> > -           /* Although we could stop as soon as we find a valid mode,
> > -              it's often better to continue until we hit Pmode, since the
> > +           /* See whether zero-based IV would ever generate all-false masks
> > +              before wrapping around.  */
> > +           bool might_wrap_p = (iv_limit == -1 || (iv_precision > cmp_bits));
>
> With the above change, the iv_limit check would no longer be needed.
>
> > +           /* Stop as soon as we find a valid mode. If we decided to use
> > +              cmp_type which is less than Pmode precision, it is often better
> > +              to use iv_type corresponding to Pmode, since the
> >                operands to the WHILE are more likely to be reusable in
> > -              address calculations.  */
> > +              address calculations in this case.  */
> >             cmp_type = this_type;
> > +           iv_type = this_type;
> >             if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
> >               break;
> > +           if (!might_wrap_p)
> > +             {
> > +               iv_type
> > +                 = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),
> > +                                                   true);
> > +               break;
> > +             }
>
> I think the loop should break in the same place as before, with the
> iv_type being what used to be the cmp_type.  The new behaviour is that
> (for the new meaning of cmp_type) we keep the current cmp_type if its
> precision is already >= iv_precision.

OK.
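
As a sketch of how I read that rule (the attached patch currently
expresses the condition via niters_type instead; see below):

      iv_type = this_type;
      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
        cmp_type = this_type;
      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
        break;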

The attached patch fixes the issues raised. Does this look better?

Thanks,
Kugan
>
> Thanks,
> Richard
From 4c6d5c2aaa1c7fef8773aabf3c6f5edb37c58b68 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Tue, 28 May 2019 11:57:54 +1000
Subject: [PATCH] PR88838 V4

Change-Id: Ica9561d88379f472e4ec4b96aab5c7e1752f2fcc
---
 gcc/testsuite/gcc.target/aarch64/pr88838.c     | 11 ++++
 gcc/testsuite/gcc.target/aarch64/sve/while_1.c | 16 +++---
 gcc/tree-vect-loop-manip.c                     | 52 +++++++------------
 gcc/tree-vect-loop.c                           | 71 ++++++++++++++++++++++++--
 gcc/tree-vectorizer.h                          |  6 +++
 5 files changed, 110 insertions(+), 46 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr88838.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c
new file mode 100644
index 0000000..d7db847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
index a93a04b..05a4860 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index b3fae5b..fa8e7f2 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+
+  tree zero_index = build_int_cst (iv_type, 0);
+  tree step = build_int_cst (iv_type,
+			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Create the IV in iv_type.  */
+  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
 	     insert_after, &index_before_incr, &index_after_incr);
 
+  zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
@@ -609,8 +615,14 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 
       /* Get the mask value for the next iteration of the loop.  */
       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
-      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
-      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
+      tree test_index_cmp_type = make_ssa_name (compare_type);
+      gimple *conv_stmt = gimple_build_assign (test_index_cmp_type,
+					       NOP_EXPR,
+					       test_index);
+      gsi_insert_before (test_gsi, conv_stmt, GSI_NEW_STMT);
+      gcall *call = vect_gen_while (next_mask, test_index_cmp_type,
+				    this_test_limit);
+      gsi_insert_after (test_gsi, call, GSI_SAME_STMT);
 
       vect_set_loop_mask (loop, mask, init_mask, next_mask);
     }
@@ -637,12 +649,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   tree orig_niters = niters;
 
   /* Type of the initial value of NITERS.  */
   tree ni_actual_type = TREE_TYPE (niters);
   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 
   /* Convert NITERS to the same size as the compare.  */
   if (compare_precision > ni_actual_precision
@@ -661,33 +673,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-	{
-	  /* Add the maximum number of skipped iterations to the
-	     maximum iteration count.  */
-	  if (TREE_CODE (niters_skip) == INTEGER_CST)
-	    iv_limit += wi::to_widest (niters_skip);
-	  else
-	    iv_limit += max_vf - 1;
-	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
   /* Get the vectorization factor in tree form.  */
   tree vf = build_int_cst (compare_type,
 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +703,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 	/* See whether zero-based IV would ever generate all-false masks
 	   before wrapping around.  */
 	bool might_wrap_p
-	  = (!known_max_iters
+	  = (iv_limit == -1
 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 				    UNSIGNED)
 		 > compare_precision));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4942c69..1240037 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1029,7 +1029,10 @@ static bool
 vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  unsigned int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+				      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1066,11 +1077,18 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 	  if (this_type
 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
 	    {
-	      /* Although we could stop as soon as we find a valid mode,
-		 it's often better to continue until we hit Pmode, since the
+	      /* See whether zero-based IV would ever generate all-false masks
+		 before wrapping around.  */
+	      bool might_wrap_p = (iv_precision > cmp_bits);
+	      /* Stop as soon as we find a valid mode.  If we decide to use
+		 a cmp_type with less than Pmode precision, it is often better
+		 to use an iv_type with Pmode precision, since the
 		 operands to the WHILE are more likely to be reusable in
-		 address calculations.  */
-	      cmp_type = this_type;
+		 address calculations in this case.  */
+	      iv_type = this_type;
+	      if (might_wrap_p
+		  || (cmp_bits <= TYPE_PRECISION (niters_type)))
+		cmp_type = this_type;
 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
 		break;
 	    }
@@ -1081,6 +1099,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9033,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Now calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+	/* Make a conservatively-correct assumption.  */
+	iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4db30cc..eb0f21f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
      is false and vectorized loop otherwise.  */
   tree simd_if_cond;
 
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct dr_vec_info *unaligned_dr;
 
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
 /* FORNOW: Used in tree-parloops.c.  */
 extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
+extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 				  enum tree_code);
-- 
2.7.4
