Hi Richard.

As discussed in the PR, the problem here is that we have values of an IV from two different iterations live outside of a loop. This keeps us from using auto-inc/dec addressing on ARM, and causes extra lea's on x86.

An abbreviated example is this:

loop:
  # p_9 = PHI <p_17(2), p_20(3)>
  p_20 = p_9 + 18446744073709551615;
  goto loop;

  # out of the loop:
  p_24 = p_9 + 18446744073709551614;
  MEM[(char *)p_20 + -1B] = 45;

Here both the previous IV (p_9) and the current IV (p_20) are used outside of the loop. On ARM this keeps us from using auto-dec addressing, because one use is at an offset of -2 and the other at -1 (the large constants above are -2 and -1 as unsigned 64-bit values).
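
To illustrate (a hand-wavy sketch with made-up register assignments, not actual compiler output), the two live values force a separate address calculation next to a plain store:

  sub   r0, r4, #2        @ the -2 use needs its own address
  strb  r3, [r4, #-1]     @ the -1 use, a plain store

whereas with a single live value the store can update the base register for free:

  strb  r3, [r4, #-1]!    @ auto-dec: base updated as part of the store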

With the attached patch we attempt to rewrite out-of-loop uses of the IV in terms of the current/last IV (p_20 in the case above). With it, we end up with:

  p_24 = p_20 + 18446744073709551615;
  *p_24 = 45;

...which helps both x86 and ARM.

As you suggested in comment 38 on the PR, I special-case out-of-loop IV uses of the form IV+CST and propagate those accordingly (along with the MEM_REF above). Otherwise, in the less specific cases, we undo the IV increment and use that value in all out-of-loop uses. For instance, in the attached testcase, we rewrite:

  george (p_9);

into

  _26 = p_20 + 1;
  ...
  george (_26);

The attached testcase exercises the IV+CST special case, as well as the more generic case via george().

Although the original PR was for ARM, this behavior can also be seen on x86, so I tested on x86 with a full bootstrap + regression tests. I also ran the new test on an x86-hosted ARM cross build and verified that we get 2 auto-dec stores. For the original test (slightly different from the testcase in this patch), we are at 104 bytes with this patch versus 116 without it. There is still the issue of a division optimization that would further reduce the code size; I will discuss that separately, as it is independent of this patch.

Oh yeah, we could make this more generic, and maybe handle any multiple of the constant, or perhaps *= and /=. Perhaps something for next stage1...
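
For the record, a sketch of the any-multiple case, which this patch does not handle:

  iv_incr = iv + CST;
  goto LOOP;
  iv_use = iv + 3*CST;

...could be rewritten as:

  iv_use = iv_incr + 2*CST;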

OK for trunk?
Aldy
gcc/

	PR middle-end/70359
	* tree-outof-ssa.c (is_iv_plus_constant): New.
	(maybe_optimize_iv_plus_constant): New.
	(undo_iv_increment): New.
	(uncoalesce_ivs_outside_loop): New.
	(insert_backedge_copies): Call uncoalesce_ivs_outside_loop.
	(rewrite_out_of_ssa): Add comment.

gcc/testsuite/

	PR middle-end/70359
	* gcc.dg/pr70359.c: New test.

diff --git a/gcc/testsuite/gcc.dg/pr70359.c b/gcc/testsuite/gcc.dg/pr70359.c
new file mode 100644
index 00000000000..b903548e45d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr70359.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-rtl-expand-details" } */
+
+/* Test uncoalesce_ivs_outside_loop().  */
+
+void george (const char *);
+
+char* inttostr(int i, char* buf, int len)
+{
+  unsigned int ui = (i > 0) ? i : -i;
+  char *p = buf + len - 1;
+  *p = '\0';
+  do {
+    *--p = '0' + (ui % 10);
+  } while ((ui /= 10) != 0);
+  if (i < 0) {
+    /* p+1 is the IV in the loop.  This call should trigger undoing
+       the increment and cause this: */
+    /* { dg-final { scan-rtl-dump-times "Adjusted one out of loop IV" 1 "expand" } } */
+    george (p+1);
+
+    *--p = '-';
+  }
+  return p;
+}
+
+/* At the last gimple dump we have:
+
+     p_22 = p_8 + 4294967294;
+     MEM[(char *)p_19 + 4294967295B] = 45;
+
+   The above should be converted by RTL expansion time into:
+
+     p_22 = p_19 + 4294967295;
+     *p_22 = 45;
+*/
+
+/* { dg-final { scan-rtl-dump "p_\[0-9\]+ = p_\[0-9\]+ \\+ \[0-9\]+;\[\n\r \]+\\*p_\[0-9\]+ = 45;" "expand" } } */
+
+/* On ARM we should get two auto-dec (writeback) stores.  */
+/* { dg-final { scan-assembler-times "str\[^\n\r]+!" 2 { target arm-*-* } } }  */
+
+/* On x86 we should end up with exactly two lea's.  */
+/* { dg-final { scan-assembler-times "lea.\t" 2 { target i?86-*-* x86_64-*-* } } } */
diff --git a/gcc/tree-outof-ssa.c b/gcc/tree-outof-ssa.c
index 59bdcd6fadd..42cc53d7052 100644
--- a/gcc/tree-outof-ssa.c
+++ b/gcc/tree-outof-ssa.c
@@ -43,6 +43,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-coalesce.h"
 #include "tree-outof-ssa.h"
 #include "dojump.h"
+#include "fold-const.h"
+#include "tree-ssa-loop-niter.h"
+#include "cfgloop.h"
 
 /* FIXME: A lot of code here deals with expanding to RTL.  All that code
    should be in cfgexpand.c.  */
@@ -1035,6 +1038,220 @@ trivially_conflicts_p (basic_block bb, tree result, tree arg)
   return false;
 }
 
+/* Return TRUE if STMT is of the form: x_99 = SSA +p INT_CST.  */
+
+static bool
+is_iv_plus_constant (gimple *stmt, tree ssa)
+{
+  return (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR
+	  && gimple_assign_rhs1 (stmt) == ssa
+	  && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST);
+}
+
+/* Helper function for uncoalesce_ivs_outside_loop.  Optimize out of
+   loop uses of the IV when they are of the form IV+CST.
+
+   Return TRUE if we were able to rewrite:
+      iv_incr = iv + CST
+      goto LOOP;
+      iv_use = iv + 2*CST
+    into:
+      iv_use = iv_incr + CST
+
+   IV is the induction variable for the current iteration.
+   IV_INCR is the IV value for the next iteration.
+   IV_USE is the statement that uses the IV outside of the loop.  */
+
+static bool
+maybe_optimize_iv_plus_constant (tree iv, tree iv_incr, gimple *iv_use)
+{
+  gimple *incr_stmt = SSA_NAME_DEF_STMT (iv_incr);
+  tree cst = gimple_assign_rhs2 (incr_stmt);
+  tree cst_times_2 = fold_build2 (PLUS_EXPR, TREE_TYPE (cst), cst, cst);
+
+  /* Handle simple IV uses of the form "iv + 2*CST".  */
+  if (!is_iv_plus_constant (iv_use, iv)
+      || !tree_int_cst_equal (gimple_assign_rhs2 (iv_use), cst_times_2))
+    return false;
+
+  /* Rewrite:
+       iv_incr = iv + CST
+       goto LOOP;
+       iv_use = iv + 2*CST
+     into:
+       iv_use = iv_incr + CST  */
+  gimple_assign_set_rhs1 (iv_use, iv_incr);
+  gimple_assign_set_rhs2 (iv_use, cst);
+  update_stmt (iv_use);
+
+  /* It would be nice if the RTL optimizers could now forward:
+       iv_use = iv_incr + CST
+       MEM[iv_incr + CST]
+     into:
+       MEM[iv_use]
+
+     For now, let's just forward it ourselves.  */
+  gimple *u;
+  tree iv_use_var = gimple_assign_lhs (iv_use);
+  imm_use_iterator imm_iter;
+  FOR_EACH_IMM_USE_STMT (u, imm_iter, iv_incr)
+    {
+      if (!is_gimple_assign (u)
+	  || TREE_CODE (gimple_assign_lhs (u)) != MEM_REF)
+	continue;
+      tree memref = gimple_assign_lhs (u);
+      if (TREE_OPERAND (memref, 0) == iv_incr
+	  && TREE_CODE (TREE_OPERAND (memref, 1)) == INTEGER_CST
+	  && tree_int_cst_equal (TREE_OPERAND (memref, 1), cst))
+	{
+	  /* MEM[iv_incr + CST] = ...  becomes  MEM[iv_use + 0] = ...  */
+	  TREE_OPERAND (memref, 0) = iv_use_var;
+	  TREE_OPERAND (memref, 1)
+	    = build_int_cst (TREE_TYPE (TREE_OPERAND (memref, 1)), 0);
+	  update_stmt (u);
+	}
+    }
+  return true;
+}
+
+/* Undo an increment to an IV and return a temporary holding the
+   undone value.
+
+   IV_INCR is the SSA name holding the result of the increment.
+   INCR_LOOP is the loop containing IV_INCR.
+   BACKEDGE is the loop's back-edge.  */
+
+static tree
+undo_iv_increment (tree iv_incr, struct loop *incr_loop, edge backedge)
+{
+  gimple *incr_stmt = SSA_NAME_DEF_STMT (iv_incr);
+  tree cst = gimple_assign_rhs2 (incr_stmt);
+  tree inv_cst = fold_build1 (NEGATE_EXPR, TREE_TYPE (cst), cst);
+
+  if (EDGE_COUNT (incr_loop->latch->succs) != 2)
+    {
+      /* Should we consider handling > 2 edges and creating a
+	 temporary along with the corresponding PHI node?  */
+      return NULL_TREE;
+    }
+
+  /* Insert a statement undoing the increment on the non-back-edge:
+       tmp = iv_incr + (-CST).  */
+  edge fallthru = NULL;
+  edge_iterator ei;
+  FOR_EACH_EDGE (fallthru, ei, incr_loop->latch->succs)
+    if (fallthru != backedge)
+      break;
+  gcc_assert (fallthru && fallthru != backedge);
+  tree tmp = make_ssa_name (TREE_TYPE (iv_incr), NULL);
+  gassign *g = gimple_build_assign (tmp, POINTER_PLUS_EXPR, iv_incr, inv_cst);
+  gsi_insert_on_edge_immediate (fallthru, g);
+  return tmp;
+}
+
+/* If there is an out of loop use of the IV, see if we can rewrite it
+   in terms of the calculated increment.
+
+   This undoes some of the damage done by forwprop creating
+   overlapping live ranges for the before and after values of an IV.
+   Undoing this can yield better auto-inc addressing.
+
+   We are looking for loops like this:
+
+   LOOP:
+     # IV = PHI <p_16(2), IV_INCR(3)>
+     ...
+     IV_INCR = IV + CST;
+     goto LOOP;
+     IV_USE = IV + CST*2;
+
+   The outside use of IV can be rewritten as:
+     IV_USE = IV_INCR + CST;
+
+   IV is the induction variable.
+   IV_INCR is the increment variable.
+   BACKEDGE is the loop's back-edge.  */
+
+static void
+uncoalesce_ivs_outside_loop (tree iv, tree iv_incr, edge backedge)
+{
+  if (TREE_CODE (iv_incr) != SSA_NAME)
+    return;
+  gimple *incr_stmt = SSA_NAME_DEF_STMT (iv_incr);
+  basic_block incr_bb = gimple_bb (incr_stmt);
+  if (!incr_bb)
+    return;
+  struct loop *incr_loop = incr_bb->loop_father;
+  if (!incr_loop
+      || !is_iv_plus_constant (incr_stmt, iv))
+    return;
+
+  gimple *iv_use;
+  imm_use_iterator imm_iter;
+  /* Any adjustment undoing the increment is created once and shared
+     among all out of loop uses of the IV that need it.  */
+  tree iv_adjust = NULL_TREE;
+  FOR_EACH_IMM_USE_STMT (iv_use, imm_iter, iv)
+    {
+      if (is_gimple_debug (iv_use)
+	  || iv_use == incr_stmt
+	  || !stmt_dominates_stmt_p (incr_stmt, iv_use)
+	  /* Only consider IV uses outside of loop.  */
+	  || flow_bb_inside_loop_p (incr_loop, gimple_bb (iv_use)))
+	continue;
+
+      /* See if we can special-case uses of the form IV+CST.  */
+      if (!maybe_optimize_iv_plus_constant (iv, iv_incr, iv_use))
+	{
+	  /* Otherwise, replace all out of loop uses of the IV in
+	     terms of a temporary that has undone the increment.
+
+	     For instance, replace:
+	       iv_incr = iv + CST
+	       goto LOOP;
+	       iv_use = iv + N*CST
+	     with:
+	       ...
+	       tmp = iv_incr - CST
+	       iv_use = tmp + N*CST  */
+	  if (!iv_adjust)
+	    {
+	      iv_adjust = undo_iv_increment (iv_incr, incr_loop, backedge);
+	      if (!iv_adjust)
+		continue;
+	    }
+	  use_operand_p use_p;
+	  bool update = false;
+	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	    {
+	      gimple *use_stmt = USE_STMT (use_p);
+	      if (flow_bb_inside_loop_p (incr_loop,
+					 gimple_bb (use_stmt)))
+		continue;
+	      /* Make sure the use we rewrite is dominated by the
+		 adjustment statement, otherwise we would create a
+		 use without a def.  */
+	      if (!dominated_by_p (CDI_DOMINATORS,
+				   gimple_bb (use_stmt),
+				   gimple_bb (SSA_NAME_DEF_STMT (iv_adjust))))
+		continue;
+	      SET_USE (use_p, iv_adjust);
+	      update = true;
+	    }
+	  if (update)
+	    {
+	      update_stmt (iv_use);
+	      if (dump_file && (dump_flags & TDF_DETAILS))
+		{
+		  fprintf (dump_file, "Adjusted one out of loop IV: ");
+		  /* print_generic_stmt ends with a newline.  */
+		  print_generic_stmt (dump_file, iv);
+		}
+	    }
+	}
+    }
+}
 
 /* Search every PHI node for arguments associated with backedges which
    we can trivially determine will need a copy (the argument is either
@@ -1090,6 +1307,8 @@ insert_backedge_copies (void)
 		  if (!gsi_end_p (gsi2))
 		    last = gsi_stmt (gsi2);
 
+		  uncoalesce_ivs_outside_loop (result, arg, e);
+
 		  /* In theory the only way we ought to get back to the
 		     start of a loop should be with a COND_EXPR or GOTO_EXPR.
 		     However, better safe than sorry.
@@ -1156,6 +1375,12 @@ finish_out_of_ssa (struct ssaexpand *sa)
 unsigned int
 rewrite_out_of_ssa (struct ssaexpand *sa)
 {
+  /* Note: Any changes to the IL we perform here won't show up in the
+     .optimized tree dump, but in the RTL expand dump.  It is a bit
+     confusing for the gimple IL to change after the last gimple
+     pass/dump.  Perhaps we could move insert_backedge_copies() into
+     uncprop during the next stage1.  */
+
   /* If elimination of a PHI requires inserting a copy on a backedge,
      then we will have to split the backedge which has numerous
      undesirable performance effects.
@@ -1164,7 +1389,6 @@ rewrite_out_of_ssa (struct ssaexpand *sa)
      copies into the loop itself.  */
   insert_backedge_copies ();
 
-
   /* Eliminate PHIs which are of no use, such as virtual or dead phis.  */
   eliminate_useless_phis ();
 
