Hello,
I wanted to update the status of the first patch that
Vladimir had posted to improve modulo-schedualing.
(http://gcc.gnu.org/ml/gcc-patches/2007-01/msg01468.html).
I tested this patch on ppc64 and currently there is one difference in
one of the fortran's testcases (forall_10.f90); this tescase fails on
vanilla version and passes with this patch. I am currently working to
find the cause for this difference.
Thanks,
Revital
(See attached file: patch_sms_12_7.txt)
Index: ddg.c
===================================================================
--- ddg.c (revision 126552)
+++ ddg.c (working copy)
@@ -177,17 +177,18 @@
{
/* Some interloop dependencies are relaxed:
1. Every insn is output dependent on itself; ignore such deps.
- 2. Every true/flow dependence is an anti dependence in the
- opposite direction with distance 1; such register deps
- will be removed by renaming if broken --- ignore them. */
- if (!(t == OUTPUT_DEP && src_node == dest_node)
- && !(t == ANTI_DEP && dt == REG_DEP))
+ 2. Assuming a single use of any register, If there were a
+ single definition of every register we could have ignored the
+ antidependencies as every true/flow dependence is an anti
+ dependence in the opposite direction with distance 1; such
+ register deps will be removed by renaming if broken. But in
+ some cases we do see multiple definition, for instance in case
+ of long long registers so we don't remove them. */
+ if (!(t == OUTPUT_DEP && src_node == dest_node))
add_backarc_to_ddg (g, e);
else
free (e);
}
- else if (t == ANTI_DEP && dt == REG_DEP)
- free (e); /* We can fix broken anti register deps using reg-moves. */
else
add_edge_to_ddg (g, e);
}
Index: modulo-sched.c
===================================================================
--- modulo-sched.c (revision 126552)
+++ modulo-sched.c (working copy)
@@ -160,7 +160,6 @@
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
-static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
ddg_node_ptr node, int cycle,
sbitmap must_precede,
@@ -366,7 +365,7 @@
}
static void
-print_node_sched_params (FILE * file, int num_nodes)
+print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
{
int i;
@@ -378,7 +377,8 @@
rtx reg_move = nsp->first_reg_move;
int j;
- fprintf (file, "Node %d:\n", i);
+ fprintf (dump_file, "Node = %d; INSN = %d\n", i,
+ (INSN_UID (g->nodes[i].insn)));
fprintf (file, " asap = %d:\n", nsp->asap);
fprintf (file, " time = %d:\n", nsp->time);
fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
@@ -391,40 +391,7 @@
}
}
-/* Calculate an upper bound for II. SMS should not schedule the loop if it
- requires more cycles than this bound. Currently set to the sum of the
- longest latency edge for each node. Reset based on experiments. */
-static int
-calculate_maxii (ddg_ptr g)
-{
- int i;
- int maxii = 0;
- for (i = 0; i < g->num_nodes; i++)
- {
- ddg_node_ptr u = &g->nodes[i];
- ddg_edge_ptr e;
- int max_edge_latency = 0;
-
- for (e = u->out; e; e = e->next_out)
- max_edge_latency = MAX (max_edge_latency, e->latency);
-
- maxii += max_edge_latency;
- }
- return maxii;
-}
-
-/*
- Breaking intra-loop register anti-dependences:
- Each intra-loop register anti-dependence implies a cross-iteration true
- dependence of distance 1. Therefore, we can remove such false dependencies
- and figure out if the partial schedule broke them by checking if (for a
- true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
- if so generate a register move. The number of such moves is equal to:
- SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
- nreg_moves = ----------------------------------- + 1 - { dependence.
- ii { 1 if not.
-*/
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps, bool rescan)
{
@@ -534,40 +501,8 @@
return reg_move_replaces;
}
-/* We call this when we want to undo the SMS schedule for a given loop.
- One of the things that we do is to delete the register moves generated
- for the sake of SMS; this function deletes the register move instructions
- recorded in the undo buffer. */
-static void
-undo_generate_reg_moves (partial_schedule_ptr ps,
- struct undo_replace_buff_elem *reg_move_replaces)
-{
- int i,j;
- for (i = 0; i < ps->g->num_nodes; i++)
- {
- ddg_node_ptr u = &ps->g->nodes[i];
- rtx prev;
- rtx crr = SCHED_FIRST_REG_MOVE (u);
- for (j = 0; j < SCHED_NREG_MOVES (u); j++)
- {
- prev = PREV_INSN (crr);
- delete_insn (crr);
- crr = prev;
- }
- SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
- }
-
- while (reg_move_replaces)
- {
- struct undo_replace_buff_elem *rep = reg_move_replaces;
-
- reg_move_replaces = reg_move_replaces->next;
- replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
- }
-}
-
/* Free memory allocated for the undo buffer. */
static void
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
@@ -639,29 +574,7 @@
PREV_INSN (last));
}
-/* As part of undoing SMS we return to the original ordering of the
- instructions inside the loop kernel. Given the partial schedule PS, this
- function returns the ordering of the instruction according to their CUID
- in the DDG (PS->G), which is the original order of the instruction before
- performing SMS. */
static void
-undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
-{
- int i;
-
- for (i = 0 ; i < ps->g->num_nodes; i++)
- if (last == ps->g->nodes[i].insn
- || last == ps->g->nodes[i].first_note)
- break;
- else if (PREV_INSN (last) != ps->g->nodes[i].insn)
- reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
- PREV_INSN (last));
-}
-
-/* Used to generate the prologue & epilogue. Duplicate the subset of
- nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
- of both), together with a prefix/suffix of their reg_moves. */
-static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
int to_stage, int for_prolog)
{
@@ -870,6 +783,9 @@
version may be entered. Just a guess. */
#define PROB_SMS_ENOUGH_ITERATIONS 80
+/* Used to calculate the upper bound of ii. */
+#define MAXII_FACTOR 2
+
/* Main entry point, perform SMS scheduling on the loops of the function
that consist of single basic blocks. */
static void
@@ -993,7 +909,8 @@
if (CALL_P (insn)
|| BARRIER_P (insn)
|| (INSN_P (insn) && !JUMP_P (insn)
- && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
+ && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
+ || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0))
break;
if (insn != NEXT_INSN (tail))
@@ -1004,6 +921,8 @@
fprintf (dump_file, "SMS loop-with-call\n");
else if (BARRIER_P (insn))
fprintf (dump_file, "SMS loop-with-barrier\n");
+ else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+ fprintf (dump_file, "SMS reg inc\n");
else
fprintf (dump_file, "SMS loop-with-not-single-set\n");
print_rtl_single (dump_file, insn);
@@ -1093,7 +1012,7 @@
mii = 1; /* Need to pass some estimate of mii. */
rec_mii = sms_order_nodes (g, mii, node_order);
mii = MAX (res_MII (g), rec_mii);
- maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
+ maxii = MAXII_FACTOR * mii;
if (dump_file)
fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
@@ -1127,8 +1046,6 @@
}
else
{
- int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END
(g->bb));
- int new_cycles;
struct undo_replace_buff_elem *reg_move_replaces;
if (dump_file)
@@ -1150,68 +1067,46 @@
normalize_sched_times (ps);
rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
set_columns_for_ps (ps);
+
+ canon_loop (loop);
- /* Generate the kernel just to be able to measure its cycles. */
- permute_partial_schedule (ps, g->closing_branch->first_note);
- reg_move_replaces = generate_reg_moves (ps, false);
+ /* case the BCT count is not known , Do loop-versioning */
+ if (count_reg && ! count_init)
+ {
+ rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
+ GEN_INT(stage_count));
+ unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
+ * REG_BR_PROB_BASE) / 100;
- /* Get the number of cycles the new kernel expect to execute in. */
- new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END
(g->bb));
+ loop_version (loop, comp_rtx, &condition_bb,
+ prob, prob, REG_BR_PROB_BASE - prob,
+ true);
+ }
- /* Get back to the original loop so we can do loop versioning. */
- undo_permute_partial_schedule (ps, g->closing_branch->first_note);
- if (reg_move_replaces)
- undo_generate_reg_moves (ps, reg_move_replaces);
+ /* Set new iteration count of loop kernel. */
+ if (count_reg && count_init)
+ SET_SRC (single_set (count_init)) = GEN_INT (loop_count
+ - stage_count + 1);
- if ( new_cycles >= orig_cycles)
- {
- /* SMS is not profitable so undo the permutation and reg move
generation
- and return the kernel to its original state. */
- if (dump_file)
- fprintf (dump_file, "Undoing SMS because it is not
profitable.\n");
+ /* Now apply the scheduled kernel to the RTL of the loop. */
+ permute_partial_schedule (ps, g->closing_branch->first_note);
- }
- else
- {
- canon_loop (loop);
+ /* Mark this loop as software pipelined so the later
+ scheduling passes doesn't touch it. */
+ if (! flag_resched_modulo_sched)
+ g->bb->flags |= BB_DISABLE_SCHEDULE;
+ /* The life-info is not valid any more. */
+ df_set_bb_dirty (g->bb);
- /* case the BCT count is not known , Do loop-versioning */
- if (count_reg && ! count_init)
- {
- rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
- GEN_INT(stage_count));
- unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
- * REG_BR_PROB_BASE) / 100;
-
- loop_version (loop, comp_rtx, &condition_bb,
- prob, prob, REG_BR_PROB_BASE - prob,
- true);
- }
-
- /* Set new iteration count of loop kernel. */
- if (count_reg && count_init)
- SET_SRC (single_set (count_init)) = GEN_INT (loop_count
- - stage_count + 1);
-
- /* Now apply the scheduled kernel to the RTL of the loop. */
- permute_partial_schedule (ps, g->closing_branch->first_note);
-
- /* Mark this loop as software pipelined so the later
- scheduling passes doesn't touch it. */
- if (! flag_resched_modulo_sched)
- g->bb->flags |= BB_DISABLE_SCHEDULE;
- /* The life-info is not valid any more. */
- df_set_bb_dirty (g->bb);
-
- reg_move_replaces = generate_reg_moves (ps, true);
- if (dump_file)
- print_node_sched_params (dump_file, g->num_nodes);
- /* Generate prolog and epilog. */
- if (count_reg && !count_init)
- generate_prolog_epilog (ps, loop, count_reg);
- else
- generate_prolog_epilog (ps, loop, NULL_RTX);
- }
+ reg_move_replaces = generate_reg_moves (ps, true);
+ if (dump_file)
+ print_node_sched_params (dump_file, g->num_nodes, g);
+ /* Generate prolog and epilog. */
+ if (count_reg && !count_init)
+ generate_prolog_epilog (ps, loop, count_reg);
+ else
+ generate_prolog_epilog (ps, loop, NULL_RTX);
+
free_undo_replace_buff (reg_move_replaces);
}
@@ -1350,8 +1245,7 @@
early_start = MAX (early_start, node_st);
- if (e->data_type == MEM_DEP)
- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+ end = MIN (end, SCHED_TIME (v_node) + ii - 1);
}
}
start = early_start;
@@ -1372,8 +1266,7 @@
late_start = MIN (late_start,
SCHED_TIME (v_node) - e->latency
+ (e->distance * ii));
- if (e->data_type == MEM_DEP)
- end = MAX (end, SCHED_TIME (v_node) - ii + 1);
+ end = MAX (end, SCHED_TIME (v_node) - ii + 1);
}
}
start = late_start;
@@ -1397,8 +1290,7 @@
early_start = MAX (early_start,
SCHED_TIME (v_node) + e->latency
- (e->distance * ii));
- if (e->data_type == MEM_DEP)
- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+ end = MIN (end, SCHED_TIME (v_node) + ii - 1);
}
}
for (e = u_node->out; e != 0; e = e->next_out)
@@ -1410,8 +1302,7 @@
late_start = MIN (late_start,
SCHED_TIME (v_node) - e->latency
+ (e->distance * ii));
- if (e->data_type == MEM_DEP)
- start = MAX (start, SCHED_TIME (v_node) - ii + 1);
+ start = MAX (start, SCHED_TIME (v_node) - ii + 1);
}
}
start = MAX (start, early_start);
@@ -1517,24 +1408,26 @@
}
/* 2. Try scheduling u in window. */
if (dump_file)
- fprintf (dump_file,
- "Trying to schedule node %d in (%d .. %d) step %d\n",
- u, start, end, step);
+ fprintf (dump_file,
+ "Trying to schedule node %d in (%d .. %d) step %d\n",
+ u, start, end, step);
/* use must_follow & must_precede bitmaps to determine order
of nodes within the cycle. */
sbitmap_zero (must_precede);
sbitmap_zero (must_follow);
for (e = u_node->in; e != 0; e = e->next_in)
- if (TEST_BIT (sched_nodes, e->src->cuid)
- && e->latency == (ii * e->distance)
- && start == SCHED_TIME (e->src))
+ if ((TEST_BIT (sched_nodes, e->src->cuid)
+ && (e->latency == (ii * e->distance)
+ && start == SCHED_TIME (e->src)) )
+ || (e->type != TRUE_DEP))
SET_BIT (must_precede, e->src->cuid);
for (e = u_node->out; e != 0; e = e->next_out)
- if (TEST_BIT (sched_nodes, e->dest->cuid)
- && e->latency == (ii * e->distance)
- && end == SCHED_TIME (e->dest))
+ if ((TEST_BIT (sched_nodes, e->dest->cuid)
+ && (e->latency == (ii * e->distance)
+ && start == SCHED_TIME (e->src)))
+ || (e->type != TRUE_DEP))
SET_BIT (must_follow, e->dest->cuid);
success = 0;
@@ -2255,58 +2148,8 @@
targetm.sched.dfa_post_cycle_insn ());
}
-/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
- the number of cycles according to DFA that the kernel fits in,
- we use this to check if we done well with SMS after we add
- register moves. In some cases register moves overhead makes
- it even worse than the original loop. We want SMS to be performed
- when it gives less cycles after register moves are added. */
-static int
-kernel_number_of_cycles (rtx first_insn, rtx last_insn)
-{
- int cycles = 0;
- rtx insn;
- int can_issue_more = issue_rate;
- state_reset (curr_state);
- for (insn = first_insn;
- insn != NULL_RTX && insn != last_insn;
- insn = NEXT_INSN (insn))
- {
- if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
- continue;
-
- /* Check if there is room for the current insn. */
- if (!can_issue_more || state_dead_lock_p (curr_state))
- {
- cycles ++;
- advance_one_cycle ();
- can_issue_more = issue_rate;
- }
-
- /* Update the DFA state and return with failure if the DFA found
- recource conflicts. */
- if (state_transition (curr_state, insn) >= 0)
- {
- cycles ++;
- advance_one_cycle ();
- can_issue_more = issue_rate;
- }
-
- if (targetm.sched.variable_issue)
- can_issue_more =
- targetm.sched.variable_issue (sched_dump, sched_verbose,
- insn, can_issue_more);
- /* A naked CLOBBER or USE generates no instruction, so don't
- let them consume issue slots. */
- else if (GET_CODE (PATTERN (insn)) != USE
- && GET_CODE (PATTERN (insn)) != CLOBBER)
- can_issue_more--;
- }
- return cycles;
-}
-
/* Checks if PS has resource conflicts according to DFA, starting from
FROM cycle to TO cycle; returns true if there are conflicts and false
if there are no conflicts. Assumes DFA is being used. */