Hi Richard,

Thanks for the review.

On Sat, 25 May 2019 at 19:41, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> writes:
> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > index 77d3dac..d6452a1 100644
> > --- a/gcc/tree-vect-loop-manip.c
> > +++ b/gcc/tree-vect-loop-manip.c
> > @@ -418,7 +418,20 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
> >    tree mask_type = rgm->mask_type;
> >    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
> >    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
> > -
> > +  bool convert = false;
> > +  tree iv_type = NULL_TREE;
> > +
> > +  /* If the compare_type is not with Pmode size, we will create an IV with
> > +     Pmode size with truncated use (i.e. converted to the correct type).
> > +     This is because using Pmode allows ivopts to reuse the IV for indices
> > +     (in the loads and store).  */
> > +  if (known_lt (GET_MODE_BITSIZE (TYPE_MODE (compare_type)),
> > +             GET_MODE_BITSIZE (Pmode)))
> > +    {
> > +      iv_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),
> > +                                             true);
> > +      convert = true;
> > +    }
> >    /* Calculate the maximum number of scalar values that the rgroup
> >       handles in total, the number that it handles for each iteration
> >       of the vector loop, and the number that it should skip during the
> > @@ -444,12 +457,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
> >       processed.  */
> >    tree index_before_incr, index_after_incr;
> >    gimple_stmt_iterator incr_gsi;
> > +  gimple_stmt_iterator incr_gsi2;
> >    bool insert_after;
> > -  tree zero_index = build_int_cst (compare_type, 0);
> > +  tree zero_index;
> >    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> > -          insert_after, &index_before_incr, &index_after_incr);
> >
> > +  if (convert)
> > +    {
> > +      /* If we are creating IV of Pmode type and converting.  */
> > +      zero_index = build_int_cst (iv_type, 0);
> > +      tree step = build_int_cst (iv_type,
> > +                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> > +      /* Creating IV of Pmode type.  */
> > +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
> > +              insert_after, &index_before_incr, &index_after_incr);
> > +      /* Create truncated index_before and after increament.  */
> > +      tree index_before_incr_trunc = make_ssa_name (compare_type);
> > +      tree index_after_incr_trunc = make_ssa_name (compare_type);
> > +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,
> > +                                                   NOP_EXPR,
> > +                                                   index_before_incr);
> > +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,
> > +                                                  NOP_EXPR,
> > +                                                  index_after_incr);
> > +      incr_gsi2 = incr_gsi;
> > +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);
> > +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);
> > +      index_before_incr = index_before_incr_trunc;
> > +      index_after_incr = index_after_incr_trunc;
> > +      zero_index = build_int_cst (compare_type, 0);
> > +    }
> > +  else
> > +    {
> > +      /* If the IV is of Pmode compare_type, no convertion needed.  */
> > +      zero_index = build_int_cst (compare_type, 0);
> > +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> > +              insert_after, &index_before_incr, &index_after_incr);
> > +    }
> >    tree test_index, test_limit, first_limit;
> >    gimple_stmt_iterator *test_gsi;
> >    if (might_wrap_p)
>
> Instead of hard-coding Pmode as a special case here, I think we should
> record the IV type in vect_verify_full_masking in addition to the comparison
> type.  (With the IV type always being at least as wide as the comparison
> type.)
Ok.
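
In the attached patch this is done by recording an IV type next to the
comparison type (names as in the patch below):

    /* Type of the IV to use in the WHILE_ULT call for fully-masked
       loops.  */
    tree iv_type;

    #define LOOP_VINFO_MASK_IV_TYPE(L)  (L)->iv_type

vect_verify_full_masking now chooses iv_type together with cmp_type,
keeping iv_type at least as wide as cmp_type.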

>
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index bd81193..2769c86 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -1035,6 +1035,30 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
> >    /* Find a scalar mode for which WHILE_ULT is supported.  */
> >    opt_scalar_int_mode cmp_mode_iter;
> >    tree cmp_type = NULL_TREE;
> > +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> > +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
> > +  widest_int iv_limit;
> > +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
> > +  if (known_max_iters)
> > +    {
> > +      if (niters_skip)
> > +     {
> > +       /* Add the maximum number of skipped iterations to the
> > +          maximum iteration count.  */
> > +       if (TREE_CODE (niters_skip) == INTEGER_CST)
> > +         iv_limit += wi::to_widest (niters_skip);
> > +       else
> > +         iv_limit += max_vf - 1;
> > +     }
> > +      /* IV_LIMIT is the maximum number of latch iterations, which is also
> > +      the maximum in-range IV value.  Round this value down to the previous
> > +      vector alignment boundary and then add an extra full iteration.  */
> > +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
> > +    }
> > +
> > +  /* Get the vectorization factor in tree form.  */
>
> Please split the loop-manip.c bits you need out into a subroutine instead
> of cut-&-pasting. :-)
Ok.
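
The iv_limit calculation is now a helper in tree-vect-loop-manip.c (as in
the patch below), shared by vect_set_loop_condition_masked and
vect_verify_full_masking:

    /* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.
       If it is not possible to calculate the limit, return -1.  */
    widest_int
    vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo);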

>
> >    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
> >      {
> >        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
> > @@ -1045,12 +1069,23 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
> >         if (this_type
> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))
> >           {
> > +           /* See whether zero-based IV would ever generate all-false masks
> > +              before wrapping around.  */
> > +           bool might_wrap_p
> > +             = (!known_max_iters
> > +                || (wi::min_precision
> > +                    (iv_limit
> > +                     * vect_get_max_nscalars_per_iter (loop_vinfo),
> > +                     UNSIGNED) > cmp_bits));
>
> The wi::min_precision is invariant, so there's no need to calculate it in
> each iteration of the loop.  Would be good to avoid the duplicated call to
> vect_get_max_nscalars_per_iter (also called for min_ni_width).
Ok.
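
The precision is now computed once, before the FOR_EACH_MODE_IN_CLASS loop
(from the patch below):

    widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);
    widest_int iv_precision = -1;
    if (iv_limit != -1)
      iv_precision
        = wi::min_precision (iv_limit
                             * vect_get_max_nscalars_per_iter (loop_vinfo),
                             UNSIGNED);

so the loop body only compares iv_precision against cmp_bits.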

>
> >             /* Although we could stop as soon as we find a valid mode,
> >                it's often better to continue until we hit Pmode, since the
> >                operands to the WHILE are more likely to be reusable in
> > -              address calculations.  */
> > +              address calculations.  Unless the limit is extended from
> > +              this_type.  */
>
> Please rewrite the comment to describe the new code rather than tacking
> this kind of thing on the end.
Ok.

>
> >             cmp_type = this_type;
> > -           if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
> > +           if (cmp_bits >= GET_MODE_BITSIZE (Pmode)
> > +               || (!might_wrap_p
> > +                   && (cmp_bits == TYPE_PRECISION (niters_type))))
>
> The TYPE_PRECISION test looks redundant.  E.g. if a loop only executes
> N times, all we care about is whether the IV we pick can handle N
> iterations without wrapping.  It doesn't really matter what type the
> original source code used to hold the loop count.
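
The TYPE_PRECISION check is dropped; the mode loop now stops either when
cmp_bits reaches the size of Pmode or as soon as the IV is known not to
wrap, in which case a Pmode-sized iv_type is used (from the patch below):

    cmp_type = this_type;
    iv_type = this_type;
    if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
      break;
    if (!might_wrap_p)
      {
        iv_type
          = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode), true);
        break;
      }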
The attached patch is revised based on your comments. Does this look better?

Thanks,
Kugan
>
> Thanks,
> Richard
From cbf18e0a3a7e34d2f3caefe1b5e0e6a5cce2e2b5 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Tue, 28 May 2019 11:57:54 +1000
Subject: [PATCH] PR88838 V3

Change-Id: I2e614e6928d0b8bf32f8470ea80c76f8a4562f78
---
 gcc/fwprop.c                                   |  13 +++
 gcc/testsuite/gcc.target/aarch64/pr88838.c     |  11 +++
 gcc/testsuite/gcc.target/aarch64/sve/while_1.c |  16 ++--
 gcc/tree-vect-loop-manip.c                     | 110 ++++++++++++++++++-------
 gcc/tree-vect-loop.c                           |  29 ++++++-
 gcc/tree-vectorizer.h                          |   7 ++
 6 files changed, 143 insertions(+), 43 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr88838.c

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index cf2c9de..5275ad3 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -1358,6 +1358,19 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
   else
     mode = GET_MODE (*loc);
 
+  /* Bail out if the use and register modes are of different classes.  */
+  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))
+    return false;
+  /* We can't get the mode of the result for
+     (set (reg:VNx16BI 109)
+          (unspec:VNx16BI [
+	    (reg:SI 131)
+	    (reg:SI 106)
+           ] UNSPEC_WHILE_LO))
+     so also bail out when the source is an UNSPEC and the modes differ.  */
+  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg))
+      && GET_CODE (SET_SRC (use_set)) == UNSPEC)
+    return false;
   new_rtx = propagate_rtx (*loc, mode, reg, src,
   			   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c
new file mode 100644
index 0000000..d7db847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
index a93a04b..05a4860 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index b3fae5b..c15b8a2 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -415,10 +415,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+  bool convert = false;
 
+  /* If compare_type is not iv_type, we will create an IV of iv_type
+     and truncate its value to compare_type at each use.  */
+  if (compare_type != iv_type)
+    convert = true;
   /* Calculate the maximum number of scalar values that the rgroup
      handles in total, the number that it handles for each iteration
      of the vector loop, and the number that it should skip during the
@@ -444,12 +450,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
      processed.  */
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
+  gimple_stmt_iterator incr_gsi2;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
+  tree zero_index;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
-	     insert_after, &index_before_incr, &index_after_incr);
 
+  if (convert)
+    {
+      /* An IV of iv_type is created and then truncated to compare_type.  */
+      zero_index = build_int_cst (iv_type, 0);
+      tree step = build_int_cst (iv_type,
+				 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+      /* Create the IV in iv_type.  */
+      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+      /* Create truncated copies of the index before and after increment.  */
+      tree index_before_incr_trunc = make_ssa_name (compare_type);
+      tree index_after_incr_trunc = make_ssa_name (compare_type);
+      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,
+						      NOP_EXPR,
+						      index_before_incr);
+      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,
+						     NOP_EXPR,
+						     index_after_incr);
+      incr_gsi2 = incr_gsi;
+      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);
+      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);
+      index_before_incr = index_before_incr_trunc;
+      index_after_incr = index_after_incr_trunc;
+      zero_index = build_int_cst (compare_type, 0);
+    }
+  else
+    {
+      /* The IV is already of compare_type, so no conversion is needed.  */
+      zero_index = build_int_cst (compare_type, 0);
+      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+    }
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
@@ -617,6 +654,41 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   return next_mask;
 }
 
+/* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.
+   If it is not possible to calculate the limit, return -1.  */
+widest_int
+vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo)
+{
+  /* Convert skip_niters to the right type.  */
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Now calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
+  if (known_max_iters)
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
    number of iterations of the original scalar loop that should be
@@ -637,12 +709,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   tree orig_niters = niters;
 
   /* Type of the initial value of NITERS.  */
   tree ni_actual_type = TREE_TYPE (niters);
   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 
   /* Convert NITERS to the same size as the compare.  */
   if (compare_precision > ni_actual_precision
@@ -661,33 +733,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-	{
-	  /* Add the maximum number of skipped iterations to the
-	     maximum iteration count.  */
-	  if (TREE_CODE (niters_skip) == INTEGER_CST)
-	    iv_limit += wi::to_widest (niters_skip);
-	  else
-	    iv_limit += max_vf - 1;
-	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);
   /* Get the vectorization factor in tree form.  */
   tree vf = build_int_cst (compare_type,
 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +763,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 	/* See whether zero-based IV would ever generate all-false masks
 	   before wrapping around.  */
 	bool might_wrap_p
-	  = (!known_max_iters
+	  = (iv_limit == -1
 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 				    UNSIGNED)
 		 > compare_precision));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e1229a5..431025b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1056,6 +1056,16 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);
+  widest_int iv_precision = -1;
+
+  if (iv_limit != -1)
+    iv_precision
+      = wi::min_precision (iv_limit
+			   * vect_get_max_nscalars_per_iter (loop_vinfo),
+			   UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1066,13 +1076,25 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 	  if (this_type
 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
 	    {
-	      /* Although we could stop as soon as we find a valid mode,
-		 it's often better to continue until we hit Pmode, since the
+	      /* See whether zero-based IV would ever generate all-false masks
+		 before wrapping around.  */
+	      bool might_wrap_p = (iv_limit == -1 || (iv_precision > cmp_bits));
+	      /* Stop as soon as we find a valid mode.  If cmp_type is
+		 narrower than Pmode, it is often better to make iv_type
+		 Pmode-sized instead, since the
 		 operands to the WHILE are more likely to be reusable in
-		 address calculations.  */
+		 address calculations in this case.  */
 	      cmp_type = this_type;
+	      iv_type = this_type;
 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
 		break;
+	      if (!might_wrap_p)
+		{
+		  iv_type
+		    = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),
+						      true);
+		  break;
+		}
 	    }
 	}
     }
@@ -1081,6 +1103,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d5fd469..b5de051 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
      is false and vectorized loop otherwise.  */
   tree simd_if_cond;
 
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct dr_vec_info *unaligned_dr;
 
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1462,6 +1467,8 @@ extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (struct loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
+extern widest_int vect_get_loop_iv_limit (struct loop *loop,
+					  loop_vec_info loop_vinfo);
 
 /* In tree-vect-stmts.c.  */
 extern poly_uint64 current_vector_size;
-- 
2.7.4
