*** Context ***
The Haifa scheduler is able to recognise some cases where a dependence D
between two instructions -- a producer P and a consumer C -- can be
broken. In particular, if C contains a memory reference M and P is an
increment, then M can be replaced with its incremented version M' within C.
For instance, the following sequence:
P: a0:DI=a0:DI+0x9 ;; increment
C: s1:DI=[a0:DI] ;; memory ref M
can be replaced with:
C: s1:DI=[a0:DI+0x9] ;; inc mem ref M'
This effectively breaks dependence D.
Fine. But what about P?
*** Issue ***
In most cases, once D is broken, P will be scheduled based on other,
remaining dependences. But what if the only dependence left is a
write-after-write aka output dependence?
After scheduling, we can end up with the following sequence:
P: a0:DI=a0:DI+0x9 ;; dead code
C': a0:DI=sp:DI+0x7ff
Obviously, a0 being immediately overwritten without any intervening
instruction, P has become functionally dead code.
*** Proposed solution ***
The attached patch adds the following logic to apply_replacement. Once D
is broken, it examines P's remaining dependences: if there is only one
left and it is an output dependence, P is marked for elision. P is
actually removed only after apply_replacement has returned, to avoid
iterator invalidation.
Is it a sound approach? Any comment regarding the suggested implementation?
Thanks,
--
PA
From 179be3811fd4d8e2ece0b59948dafaeb3c0212e6 Mon Sep 17 00:00:00 2001
From: Paul-Antoine Arras <par...@baylibre.com>
Date: Tue, 17 Jun 2025 13:36:09 +0000
Subject: [PATCH] haifa-sched: Elide leftover instruction after breaking dependency [PR120459]
PR target/120459
gcc/ChangeLog:
* haifa-sched.cc (apply_replacement): Check whether the producer of the
broken dependency can be elided; return it if so.
(recompute_todo_spec): Elide instruction based on apply_replacement's
return value.
gcc/testsuite/ChangeLog:
* g++.dg/pr120459.C: New test.
---
gcc/haifa-sched.cc | 40 +++++++-
gcc/testsuite/g++.dg/pr120459.C | 158 ++++++++++++++++++++++++++++++++
2 files changed, 194 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/g++.dg/pr120459.C
diff --git gcc/haifa-sched.cc gcc/haifa-sched.cc
index 63eb06b2d..8831b0471 100644
--- gcc/haifa-sched.cc
+++ gcc/haifa-sched.cc
@@ -1186,7 +1186,7 @@ update_insn_after_change (rtx_insn *insn)
static vec<dep_t> next_cycle_replace_deps;
static vec<int> next_cycle_apply;
-static void apply_replacement (dep_t, bool);
+static rtx_insn *apply_replacement (dep_t, bool);
static void restore_pattern (dep_t, bool);
/* Look at the remaining dependencies for insn NEXT, and compute and return
@@ -1268,6 +1268,7 @@ recompute_todo_spec (rtx_insn *next, bool for_backtrack)
{
if (!dbg_cnt (sched_breakdep))
return HARD_DEP;
+ rtx_insn *insn_to_elide = NULL;
FOR_EACH_DEP (next, SD_LIST_BACK, sd_it, dep)
{
struct dep_replacement *desc = DEP_REPLACE (dep);
@@ -1276,11 +1277,22 @@ recompute_todo_spec (rtx_insn *next, bool for_backtrack)
if (desc->insn == next && !for_backtrack)
{
gcc_assert (n_replace == 1);
- apply_replacement (dep, true);
+ insn_to_elide = apply_replacement (dep, true);
}
DEP_STATUS (dep) |= DEP_CANCELLED;
}
}
+ if (insn_to_elide != NULL_RTX)
+ {
+ for (sd_it
+ = sd_iterator_start (insn_to_elide, SD_LIST_FORW | SD_LIST_BACK
+ | SD_LIST_RES_BACK);
+ sd_iterator_cond (&sd_it, &dep);)
+ {
+ sd_delete_dep (sd_it);
+ }
+ sched_remove_insn (insn_to_elide);
+ }
return 0;
}
@@ -4719,9 +4731,10 @@ free_backtrack_queue (void)
at which point we will be called again with IMMEDIATELY true. This is
only done for machines which have instruction packets with explicit
parallelism however. */
-static void
+static rtx_insn *
apply_replacement (dep_t dep, bool immediately)
{
+ rtx_insn *insn_to_elide = NULL;
struct dep_replacement *desc = DEP_REPLACE (dep);
if (!immediately && targetm.sched.exposed_pipeline && reload_completed)
{
@@ -4733,7 +4746,7 @@ apply_replacement (dep_t dep, bool immediately)
bool success;
if (QUEUE_INDEX (desc->insn) == QUEUE_SCHEDULED)
- return;
+ return NULL;
if (sched_verbose >= 5)
fprintf (sched_dump, "applying replacement for insn %d\n",
@@ -4744,6 +4757,23 @@ apply_replacement (dep_t dep, bool immediately)
rtx_insn *insn = DEP_PRO (dep);
+ /* Elide INSN if there is only one forward dependency left and it is an
+ * output dependency. */
+ dep_t dep2;
+ unsigned n_deps = 0;
+ bool has_output_dep = false;
+ sd_iterator_def sd_it;
+ FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep2)
+ {
+ n_deps++;
+ if (DEP_TYPE (dep2) == REG_DEP_OUTPUT)
+ has_output_dep = true;
+ }
+ if (n_deps == 2 && has_output_dep)
+ {
+ insn_to_elide = insn;
+ }
+
/* Recompute priority since dependent priorities may have changed. */
priority (insn, true);
update_insn_after_change (desc->insn);
@@ -4757,6 +4787,8 @@ apply_replacement (dep_t dep, bool immediately)
backtrack_queue->replace_apply.safe_push (1);
}
}
+
+ return insn_to_elide;
}
/* We have determined that a pattern involved in DEP must be restored.
diff --git gcc/testsuite/g++.dg/pr120459.C gcc/testsuite/g++.dg/pr120459.C
new file mode 100644
index 000000000..692d82b42
--- /dev/null
+++ gcc/testsuite/g++.dg/pr120459.C
@@ -0,0 +1,158 @@
+/* PR target/120459 */
+/* { dg-do compile { target { riscv*-*-* } } } */
+/* { dg-options "-O2 -std=c++03 -w -fdump-rtl-sched2-slim" } */
+
+/* Reduced from 507.cactuBSSN */
+
+struct a *b(a *, int, int);
+double *d;
+double e;
+int c(a *, int, int, int);
+void f() {
+ int *ab, *aa;
+ a *g;
+ double *h((double *)b);
+ double *l = 0, *o, *t = 0, *ad, *al;
+ int m, p, r, ag, ai, am, ao, aq, at;
+ double *n((double *)b(g, 0, m));
+ double *q((double *)b(g, 0, p));
+ double *s((double *)r);
+ double *af((double *)b(g, 0, e));
+ double *ah((double *)b(g, 0, ag));
+ double *aj((double *)b(g, 0, ai));
+ double *an((double *)b(g, 0, am));
+ double *ap((double *)b(g, 0, ao));
+ double *ar((double *)b(g, 0, aq));
+ long au = c(g, 0, 1, 0), av = c(g, 0, 0, 0), aw = sizeof(double), ax = au,
+ ay = av;
+ double bb, bc = e;
+ double be(01.0 / bb);
+ int bf(aa[1]);
+ int bg(ab[0]);
+ for (int k = 0; k < ab[2]; ++k)
+ for (int j = bf; j < ab[1]; ++j) {
+ int bh = 0;
+ for (int i = bh; i < bg; i++) {
+ long bi = i + au * j + av * k;
+ double bj = l[bi], bk = ((double *)b)[bi], bl = s[bi];
+ double bm = af[bi];
+ double bn = aj[bi];
+ double bo = al[bi];
+ double bp = an[bi];
+ double bq = ap[bi];
+ double br;
+ double bs;
+ double bt;
+ double bu;
+ double bv;
+ double bw;
+ double bx;
+ double by;
+ double bz;
+ double cb;
+ double cc;
+ double cd;
+ double cf;
+ double cg;
+ double ch;
+ double ci;
+ double ck;
+ double cm;
+ double co;
+ double cp;
+ double cq;
+ double cr;
+ switch (at)
+ case 4: {
+ bs = 21 * ((char *)&h[bi])[ax] + (&h[bi])[ax] +
+ ((char *)&h[bi])[ax * 2] - ((char *)&h[bi])[3];
+ bt = *(double *)&(&h)[0] * (&h[bi])[ax] *
+ ((char *)&h[bi])[ax * -2] +
+ (&bi)[ax] + ((char *)&h[bi])[aw * ax * 3] +
+ ((char *)&h[bi])[ax * 3];
+ bu = (&h[bi])[ay * -1] +
+ (&h[bi])[ay] * ((&bi)[ay] & ((char *)&h[bi])[ay]) -
+ (&h[bi])[ax * ay] + (&h[bi])[ay * 3];
+ bw = *(double *)&((char *)&n[bi])[ax * -1] +
+ *(double *)&((char *)&n[bi])[ax] +
+ *(double *)&((char *)&n[bi])[ax * -2] -
+ *(double *)&((char *)&n[bi])[ax * 2] -
+ *(double *)&((char *)&n[bi])[ax * -3] +
+ *(double *)&((char *)&n[bi])[ax * 3];
+ bx = (&n[bi])[0] + *(double *)&((char *)&n[bi])[ax * -1] +
+ *(double *)&((char *)&n[bi])[ax] -
+ *(double *)&((char *)&n[bi])[ax * -2] +
+ *(double *)&((char *)&n[bi])[ax * 2] +
+ *(double *)&((char *)&n[bi])[ax * -3] +
+ *(double *)&((char *)&n[bi])[ax * 3];
+ by = *(double *)&((char *)&n[bi])[ay * -1] +
+ *(double *)&((char *)&n[bi])[ay] +
+ *(double *)&((char *)&n[bi])[ay * -2] *
+ *(double *)&((char *)&n[bi])[ay * 2] -
+ *(double *)&((char *)&n[bi])[ay * -3] +
+ *(double *)&((char *)&n[bi])[ay * 3] *
+ (0 * (&n[bi])[0] +
+ 5 * *(double *)&((char *)&n[bi])[ay * -1] +
+ *(double *)&((char *)&n[bi])[ay] -
+ (*(double *)&((char *)&n[bi])[ay * -2] +
+ *(double *)&((char *)&n[bi])[ay * 2]) +
+ *(double *)&((char *)&n[bi])[ay * -3] +
+ *(double *)&((char *)&n[bi])[ay * 3]) *
+ be;
+ bz = *(double *)&((char *)0)[aw] *
+ ((&q[bi])[2] - ((char *)0)[3]) * bc;
+ cb = (&q[bi])[aw * ax] + (&q[bi])[ax] * (&q[bi])[ax * 2] +
+ (&q[bi])[aw * ax * 2] + ((char *)&q[bi])[ax];
+ cc = ((char *)&q[bi])[ay * -1] +
+ (double)((char *)&q[bi])[ay] + (&bi)[2] -
+ ((char *)&q[bi])[ay * 2] - (&q[bi])[ay];
+ cd = 20 * (&bi)[0] + (&bi)[3];
+ cf = (&bi)[ax - 1] * ((char *)&t[bi])[0] -
+ (&t[bi])[ax * 2] - (&t[bi])[ax * -3] +
+ (&t[bi])[ax * 3];
+ cg = (&t[bi])[ax] +
+ *(double *)((char *)&t[bi])[ax] * (&t[bi])[ax * -20] +
+ ((char *)0)[-3] + *(double *)((char *)0)[ax];
+ ch = 21 * ((char *)&t[bi])[ay * -1] + ((char *)&t[bi])[ay] +
+ (&t[bi])[ay * 2] * ((char *)&t[bi])[ay * 2] -
+ (&t[bi])[ay * 3] + (&bi)[ay * 3];
+ ci = ((&t[bi])[aw * ax] + (&t[bi])[ay] +
+ ((char *)0)[ay * 3]) *
+ be;
+ ck = ((char *)d)[ax] +
+ *(double *)((char *)&d[bi])[aw * ax] * ((char *)0)[2] +
+ (&d[bi])[aw * ax] + *(double *)((char *)&bi)[ax] +
+ *(double *)((char *)&d[bi])[ax] * (&d[bi])[0] +
+ (&d[bi])[ay * -1] * (&d[bi])[ay] -
+ ((&d[bi])[ay * -2] + (&d[bi])[ay * 2]) + *(&d)[ay] +
+ ((char *)&d[bi])[ay];
+ cm = (&ah[bi])[0] + *(double *)&((char *)0)[aw];
+ co = *(double *)((char *)&ah[bi])[ax * -1] +
+ ((char *)&ah[bi])[ax] * *(double *)((char *)ah)[ax] +
+ (&ah[bi])[ax * 2] + (&ah[bi])[ax] + (&ah[bi])[ax * 3];
+ cp =
+ 21 * (&ah[bi])[ay] +
+ 21 * ((char *)&ah[bi])[ay] * (&ah[bi])[ax * -2] *
+ ((char *)&ah[bi])[ay * 2] -
+ ((char *)&bi)[ay] +
+ (&ah[bi])[ax * ay] *
+ ((&ar[bi])[0] * (&ar[bi])[ax] * (&ar[bi])[aw] +
+ (&bi)[2] + (&ar[bi])[aw * ax] + (&ar[bi])[ax * 3]);
+ }
+ bk = bk * bw + by * bx * bl + bz * cc + cb * cd * cf * ch +
+ cg * ci + ck + cm * bq * cp + co + cq * 0;
+ bj = bj * bo + cr * 0 * bp * bs + bu + br * 0 + bt + bv * 0;
+ l[bi] = bj;
+ o[bi] = bk;
+ ad[bi] = bm;
+ aj[bi] = bn;
+ }
+ }
+}
+
+/* Before this PR was fixed, we got:
+ 2302: a0:DI=a0:DI+0x9
+ 2303: a0:DI=sp:DI+0x7ff
+where 2302 was obviously dead code. So we now check that this pattern is gone.
+*/
+/* { dg-final { scan-rtl-dump-not "\[0-9]+: (\[a-z]\[0-9]+):DI=\\1:DI\\+0x\[0-9]+\n \[0-9]+: \\1:DI=sp:DI\\+0x\[0-9]+" "sched2" } } */
--
2.39.5