[PATCH] Modulo-scheduling improvements. Patch 1 of 2. - A status update

Revital1 Eres Thu, 12 Jul 2007 21:11:56 -0700

Hello,

I wanted to update the status of the first patch that
Vladimir had posted to improve modulo-schedualing.
(http://gcc.gnu.org/ml/gcc-patches/2007-01/msg01468.html).


I tested this patch on ppc64 and currently there is one difference in
one of the fortran's testcases (forall_10.f90); this tescase fails on
vanilla version and passes with this patch.  I am currently working to
find the cause for this difference.

Thanks,
Revital


(See attached file: patch_sms_12_7.txt)

Index: ddg.c
===================================================================
--- ddg.c       (revision 126552)
+++ ddg.c       (working copy)
@@ -177,17 +177,18 @@
     {
       /* Some interloop dependencies are relaxed:
         1. Every insn is output dependent on itself; ignore such deps.
-        2. Every true/flow dependence is an anti dependence in the
-        opposite direction with distance 1; such register deps
-        will be removed by renaming if broken --- ignore them.  */
-      if (!(t == OUTPUT_DEP && src_node == dest_node)
-         && !(t == ANTI_DEP && dt == REG_DEP))
+         2. Assuming a single use of any register, If there were a
+         single definition of every register we could have ignored the
+         antidependencies as every true/flow dependence is an anti
+         dependence in the opposite direction with distance 1; such
+         register deps will be removed by renaming if broken. But in
+         some cases we do see multiple definition, for instance in case
+         of long long registers so we don't remove them.  */
+      if (!(t == OUTPUT_DEP && src_node == dest_node))
        add_backarc_to_ddg (g, e);
       else
        free (e);
     }
-  else if (t == ANTI_DEP && dt == REG_DEP)
-    free (e);  /* We can fix broken anti register deps using reg-moves.  */
   else
     add_edge_to_ddg (g, e);
 }
Index: modulo-sched.c
===================================================================
--- modulo-sched.c      (revision 126552)
+++ modulo-sched.c      (working copy)
@@ -160,7 +160,6 @@
 static void free_partial_schedule (partial_schedule_ptr);
 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
 void print_partial_schedule (partial_schedule_ptr, FILE *);
-static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
                                                ddg_node_ptr node, int cycle,
                                                sbitmap must_precede,
@@ -366,7 +365,7 @@
 }
 
 static void
-print_node_sched_params (FILE * file, int num_nodes)
+print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
 {
   int i;
 
@@ -378,7 +377,8 @@
       rtx reg_move = nsp->first_reg_move;
       int j;
 
-      fprintf (file, "Node %d:\n", i);
+      fprintf (dump_file, "Node = %d; INSN = %d\n", i,
+              (INSN_UID (g->nodes[i].insn)));
       fprintf (file, " asap = %d:\n", nsp->asap);
       fprintf (file, " time = %d:\n", nsp->time);
       fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
@@ -391,40 +391,7 @@
     }
 }
 
-/* Calculate an upper bound for II.  SMS should not schedule the loop if it
-   requires more cycles than this bound.  Currently set to the sum of the
-   longest latency edge for each node.  Reset based on experiments.  */
-static int
-calculate_maxii (ddg_ptr g)
-{
-  int i;
-  int maxii = 0;
 
-  for (i = 0; i < g->num_nodes; i++)
-    {
-      ddg_node_ptr u = &g->nodes[i];
-      ddg_edge_ptr e;
-      int max_edge_latency = 0;
-
-      for (e = u->out; e; e = e->next_out)
-       max_edge_latency = MAX (max_edge_latency, e->latency);
-
-      maxii += max_edge_latency;
-    }
-  return maxii;
-}
-
-/*
-   Breaking intra-loop register anti-dependences:
-   Each intra-loop register anti-dependence implies a cross-iteration true
-   dependence of distance 1. Therefore, we can remove such false dependencies
-   and figure out if the partial schedule broke them by checking if (for a
-   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
-   if so generate a register move.   The number of such moves is equal to:
-              SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
-   nreg_moves = ----------------------------------- + 1 - {   dependence.
-                            ii                          { 1 if not.
-*/
 static struct undo_replace_buff_elem *
 generate_reg_moves (partial_schedule_ptr ps, bool rescan)
 {
@@ -534,40 +501,8 @@
   return reg_move_replaces;
 }
 
-/* We call this when we want to undo the SMS schedule for a given loop.
-   One of the things that we do is to delete the register moves generated
-   for the sake of SMS; this function deletes the register move instructions
-   recorded in the undo buffer.  */
-static void
-undo_generate_reg_moves (partial_schedule_ptr ps,
-                        struct undo_replace_buff_elem *reg_move_replaces)
-{
-  int i,j;
 
-  for (i = 0; i < ps->g->num_nodes; i++)
-    {
-      ddg_node_ptr u = &ps->g->nodes[i];
-      rtx prev;
-      rtx crr = SCHED_FIRST_REG_MOVE (u);
 
-      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
-       {
-         prev = PREV_INSN (crr);
-         delete_insn (crr);
-         crr = prev;
-       }
-      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
-    }
-
-  while (reg_move_replaces)
-    {
-      struct undo_replace_buff_elem *rep = reg_move_replaces;
-
-      reg_move_replaces = reg_move_replaces->next;
-      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
-    }
-}
-
 /* Free memory allocated for the undo buffer.  */
 static void
 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
@@ -639,29 +574,7 @@
                            PREV_INSN (last));
 }
 
-/* As part of undoing SMS we return to the original ordering of the
-   instructions inside the loop kernel.  Given the partial schedule PS, this
-   function returns the ordering of the instruction according to their CUID
-   in the DDG (PS->G), which is the original order of the instruction before
-   performing SMS.  */
 static void
-undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
-{
-  int i;
-
-  for (i = 0 ; i < ps->g->num_nodes; i++)
-    if (last == ps->g->nodes[i].insn
-       || last == ps->g->nodes[i].first_note)
-      break;
-    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
-      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
-                         PREV_INSN (last));
-}
-
-/* Used to generate the prologue & epilogue.  Duplicate the subset of
-   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
-   of both), together with a prefix/suffix of their reg_moves.  */
-static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
                           int to_stage, int for_prolog)
 {
@@ -870,6 +783,9 @@
    version may be entered.  Just a guess.  */
 #define PROB_SMS_ENOUGH_ITERATIONS 80
 
+/* Used to calculate the upper bound of ii.  */
+#define MAXII_FACTOR 2
+
 /* Main entry point, perform SMS scheduling on the loops of the function
    that consist of single basic blocks.  */
 static void
@@ -993,7 +909,8 @@
        if (CALL_P (insn)
            || BARRIER_P (insn)
            || (INSN_P (insn) && !JUMP_P (insn)
-               && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
+               && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
+            || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0))
          break;
 
       if (insn != NEXT_INSN (tail))
@@ -1004,6 +921,8 @@
                fprintf (dump_file, "SMS loop-with-call\n");
              else if (BARRIER_P (insn))
                fprintf (dump_file, "SMS loop-with-barrier\n");
+              else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+                fprintf (dump_file, "SMS reg inc\n");
              else
                fprintf (dump_file, "SMS loop-with-not-single-set\n");
              print_rtl_single (dump_file, insn);
@@ -1093,7 +1012,7 @@
       mii = 1; /* Need to pass some estimate of mii.  */
       rec_mii = sms_order_nodes (g, mii, node_order);
       mii = MAX (res_MII (g), rec_mii);
-      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
+      maxii = MAXII_FACTOR * mii;
 
       if (dump_file)
        fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
@@ -1127,8 +1046,6 @@
        }
       else
        {
-         int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END 
(g->bb));
-         int new_cycles;
          struct undo_replace_buff_elem *reg_move_replaces;
 
          if (dump_file)
@@ -1150,68 +1067,46 @@
          normalize_sched_times (ps);
          rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
          set_columns_for_ps (ps);
+         
+         canon_loop (loop);
 
-         /* Generate the kernel just to be able to measure its cycles.  */
-         permute_partial_schedule (ps, g->closing_branch->first_note);
-         reg_move_replaces = generate_reg_moves (ps, false);
+          /* case the BCT count is not known , Do loop-versioning */
+         if (count_reg && ! count_init)
+            {
+             rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
+                                            GEN_INT(stage_count));
+             unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
+                              * REG_BR_PROB_BASE) / 100;
 
-         /* Get the number of cycles the new kernel expect to execute in.  */
-         new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END 
(g->bb));
+             loop_version (loop, comp_rtx, &condition_bb,
+                           prob, prob, REG_BR_PROB_BASE - prob,
+                           true);
+            }
 
-         /* Get back to the original loop so we can do loop versioning.  */
-         undo_permute_partial_schedule (ps, g->closing_branch->first_note);
-         if (reg_move_replaces)
-           undo_generate_reg_moves (ps, reg_move_replaces);
+         /* Set new iteration count of loop kernel.  */
+          if (count_reg && count_init)
+           SET_SRC (single_set (count_init)) = GEN_INT (loop_count
+                                                    - stage_count + 1);
 
-         if ( new_cycles >= orig_cycles)
-           {
-             /* SMS is not profitable so undo the permutation and reg move 
generation
-                and return the kernel to its original state.  */
-             if (dump_file)
-               fprintf (dump_file, "Undoing SMS because it is not 
profitable.\n");
+         /* Now apply the scheduled kernel to the RTL of the loop.  */
+         permute_partial_schedule (ps, g->closing_branch->first_note);
 
-           }
-         else
-           {
-             canon_loop (loop);
+          /* Mark this loop as software pipelined so the later
+            scheduling passes doesn't touch it.  */
+         if (! flag_resched_modulo_sched)
+           g->bb->flags |= BB_DISABLE_SCHEDULE;
+         /* The life-info is not valid any more.  */
+         df_set_bb_dirty (g->bb);
 
-              /* case the BCT count is not known , Do loop-versioning */
-             if (count_reg && ! count_init)
-               {
-                 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
-                                                GEN_INT(stage_count));
-                 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
-                                  * REG_BR_PROB_BASE) / 100;
-
-                 loop_version (loop, comp_rtx, &condition_bb,
-                               prob, prob, REG_BR_PROB_BASE - prob,
-                               true);
-               }
-
-             /* Set new iteration count of loop kernel.  */
-              if (count_reg && count_init)
-               SET_SRC (single_set (count_init)) = GEN_INT (loop_count
-                                                            - stage_count + 1);
-
-             /* Now apply the scheduled kernel to the RTL of the loop.  */
-             permute_partial_schedule (ps, g->closing_branch->first_note);
-
-              /* Mark this loop as software pipelined so the later
-             scheduling passes doesn't touch it.  */
-             if (! flag_resched_modulo_sched)
-               g->bb->flags |= BB_DISABLE_SCHEDULE;
-             /* The life-info is not valid any more.  */
-             df_set_bb_dirty (g->bb);
-
-             reg_move_replaces = generate_reg_moves (ps, true);
-             if (dump_file)
-               print_node_sched_params (dump_file, g->num_nodes);
-             /* Generate prolog and epilog.  */
-             if (count_reg && !count_init)
-               generate_prolog_epilog (ps, loop, count_reg);
-             else
-               generate_prolog_epilog (ps, loop, NULL_RTX);
-           }
+         reg_move_replaces = generate_reg_moves (ps, true);
+         if (dump_file)
+           print_node_sched_params (dump_file, g->num_nodes, g);
+         /* Generate prolog and epilog.  */
+         if (count_reg && !count_init)
+           generate_prolog_epilog (ps, loop, count_reg);
+         else
+           generate_prolog_epilog (ps, loop, NULL_RTX);
+           
          free_undo_replace_buff (reg_move_replaces);
        }
 
@@ -1350,8 +1245,7 @@
 
              early_start = MAX (early_start, node_st);
 
-             if (e->data_type == MEM_DEP)
-               end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+             end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
       start = early_start;
@@ -1372,8 +1266,7 @@
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
-             if (e->data_type == MEM_DEP)
-               end = MAX (end, SCHED_TIME (v_node) - ii + 1);
+             end = MAX (end, SCHED_TIME (v_node) - ii + 1);
            }
        }
       start = late_start;
@@ -1397,8 +1290,7 @@
              early_start = MAX (early_start,
                                 SCHED_TIME (v_node) + e->latency
                                 - (e->distance * ii));
-             if (e->data_type == MEM_DEP)
-               end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+             end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
       for (e = u_node->out; e != 0; e = e->next_out)
@@ -1410,8 +1302,7 @@
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
-             if (e->data_type == MEM_DEP)
-               start = MAX (start, SCHED_TIME (v_node) - ii + 1);
+             start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
        }
       start = MAX (start, early_start);
@@ -1517,24 +1408,26 @@
            }
          /* 2. Try scheduling u in window.  */
          if (dump_file)
-           fprintf (dump_file,
-                    "Trying to schedule node %d in (%d .. %d) step %d\n",
-                    u, start, end, step);
+            fprintf (dump_file,
+                    "Trying to schedule node %d in (%d .. %d) step %d\n",
+                    u, start, end, step);
 
           /* use must_follow & must_precede bitmaps to determine order
             of nodes within the cycle.  */
           sbitmap_zero (must_precede);
           sbitmap_zero (must_follow);
          for (e = u_node->in; e != 0; e = e->next_in)
-            if (TEST_BIT (sched_nodes, e->src->cuid)
-               && e->latency == (ii * e->distance)
-               && start == SCHED_TIME (e->src))
+            if ((TEST_BIT (sched_nodes, e->src->cuid)
+               && (e->latency == (ii * e->distance)
+                    && start == SCHED_TIME (e->src)) )
+                 || (e->type != TRUE_DEP))
              SET_BIT (must_precede, e->src->cuid);
 
          for (e = u_node->out; e != 0; e = e->next_out)
-            if (TEST_BIT (sched_nodes, e->dest->cuid)
-               && e->latency == (ii * e->distance)
-               && end == SCHED_TIME (e->dest))
+            if ((TEST_BIT (sched_nodes, e->dest->cuid)
+               && (e->latency == (ii * e->distance)
+                    && start == SCHED_TIME (e->src)))
+                 || (e->type != TRUE_DEP))
              SET_BIT (must_follow, e->dest->cuid);
 
          success = 0;
@@ -2255,58 +2148,8 @@
                      targetm.sched.dfa_post_cycle_insn ());
 }
 
-/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
-   the number of cycles according to DFA that the kernel fits in,
-   we use this to check if we done well with SMS after we add
-   register moves.  In some cases register moves overhead makes
-   it even worse than the original loop.  We want SMS to be performed
-   when it gives less cycles after register moves are added.  */
-static int
-kernel_number_of_cycles (rtx first_insn, rtx last_insn)
-{
-  int cycles = 0;
-  rtx insn;
-  int can_issue_more = issue_rate;
 
-  state_reset (curr_state);
 
-  for (insn = first_insn;
-       insn != NULL_RTX && insn != last_insn;
-       insn = NEXT_INSN (insn))
-    {
-      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
-       continue;
-
-      /* Check if there is room for the current insn.  */
-      if (!can_issue_more || state_dead_lock_p (curr_state))
-       {
-         cycles ++;
-         advance_one_cycle ();
-         can_issue_more = issue_rate;
-       }
-
-       /* Update the DFA state and return with failure if the DFA found
-          recource conflicts.  */
-      if (state_transition (curr_state, insn) >= 0)
-       {
-         cycles ++;
-         advance_one_cycle ();
-         can_issue_more = issue_rate;
-       }
-
-      if (targetm.sched.variable_issue)
-       can_issue_more =
-         targetm.sched.variable_issue (sched_dump, sched_verbose,
-                                       insn, can_issue_more);
-      /* A naked CLOBBER or USE generates no instruction, so don't
-        let them consume issue slots.  */
-      else if (GET_CODE (PATTERN (insn)) != USE
-              && GET_CODE (PATTERN (insn)) != CLOBBER)
-       can_issue_more--;
-    }
-  return cycles;
-}
-
 /* Checks if PS has resource conflicts according to DFA, starting from
    FROM cycle to TO cycle; returns true if there are conflicts and false
    if there are no conflicts.  Assumes DFA is being used.  */

[PATCH] Modulo-scheduling improvements. Patch 1 of 2. - A status update

Reply via email to