On Mon, 17 May 2021, Jiufu Guo wrote:

> When there is the possibility that overflow/wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
> 
> foo (int *a, int *b, unsigned k, unsigned n)
> {
>   while (++k != n)
>     a[k] = b[k]  + 1;
> }
> 
> For this code, if "k > n", k would wrap.  if "k < n" at begining,
> it could be optimized (e.g. vectorization).
> 
> We can split the loop into two loops:
> 
>   while (++k > n)
>     a[k] = b[k]  + 1;
>   while (l++ < n)
>     a[k] = b[k]  + 1;
> 
> then for the second loop, it could be optimized.

Btw, I think even the first loop should be vectorized.  I see we do
not handle it in niter analysis:

Analyzing loop at t.c:3
t.c:3:14: note:  === analyze_loop_nest ===
t.c:3:14: note:   === vect_analyze_loop_form ===
t.c:3:14: note:    === get_loop_niters ===
t.c:3:14: missed:   not vectorized: number of iterations cannot be 
computed.

but the number of iterations should be UINT_MAX - k (unless I'm
missing sth), may_be_zero would be sth like k < n.  It would be
nice to not split this into loops that niter analysis cannot handle ...

> This patch is spltting this kind of small loop to achieve better performance.
> 
> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
> 
> Thanks!
> 
> Jiufu Guo.
> 
> gcc/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guoji...@linux.ibm.com>
> 
>       * tree-ssa-loop-split.c (connect_loop_phis): Add new param.
>       (get_ne_cond_branch): New function.
>       (split_ne_loop): New function.
>       (split_loop_on_ne_cond): New function.
>       (tree_ssa_split_loops): Use split_loop_on_ne_cond.
> 
> gcc/testsuite/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guoji...@linux.ibm.com>
> 
>       * gcc.dg/loop-split1.c: New test.
>       * g++.dg/vect/pr98064.cc: Suppress warning.
> ---
>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>  gcc/tree-ssa-loop-split.c            | 188 ++++++++++++++++++++++++++-
>  3 files changed, 296 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
> b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
>  
>  const long long &min(const long long &__a, long long &__b) {
>    if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..30b006b1b14
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,108 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void bar();
> +void foo4(unsigned n,  unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +        return;
> +      bar();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +foo5 (double *a, unsigned n, unsigned i)
> +{
> +  while (a[i] > 0 && i != n)
> +    i++;
> +
> +  return i;
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++,q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */

Please consider making the testcase execute ones, verifying computation
results.

> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..5c1742b5ff4 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>  
>  /* This file implements two kinds of loop splitting.
>  
> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>     this.  The loops need to fulfill easy_exit_values().  */
>  

please document use_prev

>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +                bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2, 
> edge new_e)
>  
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> +                UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
>  
> +/* Check if the LOOP exit branch likes "if (idx != bound)",

is like

> +   Return the branch edge which exit loop, if overflow/wrap
> +   may happen on "idx".  */

I think we only want to handle wrapping (thus not undefined overflow).

> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      /* Check if there is possible wrap/overflow.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +     continue;
> +      if (niter.control.no_overflow)
> +     return NULL;
> +      if (niter.cmp != NE_EXPR)
> +     continue;
> +
> +      /* Check loop is simple to split.  */

it seems like the following and below condition mean "simple"
is either all code is before or after the exit in question,
please improve the comment to explain the two cases.

> +      if (single_pred_p (loop->latch)
> +       && single_pred_edge (loop->latch)->src == e->src
> +       && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))

split_loop uses empty_block_p (loop->latch).

> +     return e;
> +
> +      /* Simple header.  */
> +      if (e->src == loop->header)
> +     {
> +       if (get_virtual_phi (e->src))
> +         continue;

So this disqualifies all loops which store to memory?

> +
> +       /* Only one phi.  */
> +       gphi_iterator psi = gsi_start_phis (e->src);
> +       if (gsi_end_p (psi))
> +         continue;
> +       gsi_next (&psi);
> +       if (!gsi_end_p (psi))
> +         continue;
> +
> +       /* ++i or ++i */
> +       gimple_stmt_iterator gsi = gsi_start_bb (e->src);

I think you want gsi_start_nondebug_after_labels_bb (e->src)

> +
> +       gimple *gc = last_stmt (e->src);
> +       tree idx = gimple_cond_lhs (gc);

you have to check the last stmt is a GIMPLE_COND, we have
recorded exits that exit on EH for example.

> +       if (expr_invariant_in_loop_p (loop, idx))
> +         idx = gimple_cond_rhs (gc);
> +
> +       gimple *s1 = gsi_stmt (gsi);
> +       if (!(is_gimple_assign (s1) && idx
> +             && (idx == gimple_assign_lhs (s1)
> +                 || idx == gimple_assign_rhs1 (s1))))
> +         continue;

I think you want to allow a IV PHI plus an IV increment
in the block and nothing else.  The pattern matching isn't
exact though and could be fooled by, say

 header:
   i_2 = PHI <i_5(D), i_3(latch)>
   _3 = i_2 + 5;
   if (i_2 != n_4(D))
     goto exit;

 latch:
   i_3 = i_2 + 1;
   foo (_3);
   goto header;

that is, the increment you matched is not the new iterator value
but instead a derived value used in a stmt in the latch.  So
I think if you really cannot allow any code besides the IV
increment then you need to actually get the PHI node and
match the only real stmt with the definition of the PHIs
latch edge value.

Or relax all this, of course.

> +       gsi_next (&gsi);
> +       if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
> +         return e;
> +     }
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +                                  profile_probability::always (),
> +                                  profile_probability::never (),
> +                                  profile_probability::always (),
> +                                  profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, up_code);
> +  gimple_cond_set_code (dup_gc, down_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
> +                  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, down_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 
> 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> +   S;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> +  free (bbs);
> +
> +  if (num > param_max_peeled_insns)
> +    return false;
> +
> +  edge branch_edge = get_ne_cond_branch (loop);
> +  if (branch_edge && split_ne_loop (loop, branch_edge))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  */
>  
>  static unsigned int
> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>       continue;
>  
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +       || split_loop_on_ne_cond (loop))
>       {
>         /* Mark our containing loop as having had some split inner loops.  */
>         loop_outer (loop)->aux = loop;
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Reply via email to