On 22/01/2024 17:09, Richard Sandiford wrote:
> Sorry for the earlier review comment about debug insns.  I hadn't
> looked far enough into the queue to see this patch.
> 
> Alex Coplan <alex.cop...@arm.com> writes:
> > As the PR shows, we were missing code to update debug uses in the
> > load/store pair fusion pass.  This patch fixes that.
> >
> > Note that this patch depends on the following patch to create new uses
> > in RTL-SSA, submitted as part of the fixes for PR113070:
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642919.html
> >
> > The patch tries to give a complete treatment of the debug uses that will
> > be affected by the changes we make, and in particular makes an effort to
> > preserve debug info where possible, e.g. when re-ordering an update of
> > a base register by a constant over a debug use of that register.  When
> > re-ordering loads over a debug use of a transfer register, we reset the
> > debug insn.  Likewise when re-ordering stores over debug uses of mem.
> >
> > While doing this I noticed that try_promote_writeback used a strange
> > choice of move_range for the pair insn, in that it chose the previous
> > nondebug insn instead of the insn itself.  Since the insn is being
> > changed, these move ranges are equivalent (at least in terms of nondebug
> > insn placement as far as RTL-SSA is concerned), but I think it is more
> > natural to choose the pair insn itself.  This is needed to avoid
> > incorrectly updating some debug uses.
> >
> > Notes on testing:
> >  - The series was bootstrapped/regtested on top of the fixes for
> >    PR113070 and PR113356.  It seemed to make more sense to test with
> >    correct use/def info, and as mentioned above, this patch depends on
> >    one of the PR113070 patches.
> >  - I also ran the testsuite with -g -funroll-loops -mearly-ldp-fusion
> >    -mlate-ldp-fusion to try and flush out more issues, and worked
> >    through some examples where writeback updates were triggered to
> >    make sure it was doing the right thing.
> >  - The patches also survived an LTO+PGO bootstrap with
> >    --enable-languages=all (with the passes enabled).
> >
> > Bootstrapped/regtested as a series on aarch64-linux-gnu (with/without
> > the pass enabled).  OK for trunk?
> >
> > Thanks,
> > Alex
> >
> > gcc/ChangeLog:
> >
> >     PR target/113089
> >     * config/aarch64/aarch64-ldp-fusion.cc (reset_debug_use): New.
> >     (fixup_debug_use): New.
> >     (fixup_debug_uses_trailing_add): New.
> >     (fixup_debug_uses): New. Use it ...
> >     (ldp_bb_info::fuse_pair): ... here.
> >     (try_promote_writeback): Call fixup_debug_uses_trailing_add to
> >     fix up debug uses of the base register that are affected by
> >     folding in the trailing add insn.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     PR target/113089
> >     * gcc.c-torture/compile/pr113089.c: New test.
> > ---
> >  gcc/config/aarch64/aarch64-ldp-fusion.cc      | 332 +++++++++++++++++-
> >  .../gcc.c-torture/compile/pr113089.c          |  26 ++
> >  2 files changed, 351 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr113089.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
> > b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> > index 4d7fd72c6b1..fd0278e7acf 100644
> > --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> > +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> > @@ -1342,6 +1342,309 @@ ldp_bb_info::track_tombstone (int uid)
> >      gcc_unreachable (); // Bit should have changed.
> >  }
> >  
> > +// Reset the debug insn containing USE (the debug insn has been
> > +// optimized away).
> > +static void
> > +reset_debug_use (use_info *use)
> > +{
> > +  auto use_insn = use->insn ();
> > +  auto use_rtl = use_insn->rtl ();
> > +  insn_change change (use_insn);
> > +  change.new_uses = {};
> > +  INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();
> > +  crtl->ssa->change_insn (change);
> > +}
> > +
> > +// USE is a debug use that needs updating because DEF (a def of the same
> > +// register) is being re-ordered over it.  If BASE is non-null, then DEF
> > +// is an update of the register BASE by a constant, given by WB_OFFSET,
> > +// and we can preserve debug info by accounting for the change in side
> > +// effects.
> > +static void
> > +fixup_debug_use (obstack_watermark &attempt,
> > +            use_info *use,
> > +            def_info *def,
> > +            rtx base,
> > +            poly_int64 wb_offset)
> > +{
> > +  auto use_insn = use->insn ();
> > +  if (base)
> > +    {
> > +      auto use_rtl = use_insn->rtl ();
> > +      insn_change change (use_insn);
> > +
> > +      gcc_checking_assert (REG_P (base) && use->regno () == REGNO (base));
> > +      change.new_uses = check_remove_regno_access (attempt,
> > +                                              change.new_uses,
> > +                                              use->regno ());
> > +
> > +      // The effect of the writeback is to add WB_OFFSET to BASE.  If
> > +      // we're re-ordering DEF below USE, then we update USE by adding
> > +      // WB_OFFSET to it.  Otherwise, if we're re-ordering DEF above
> > +      // USE, we update USE by undoing the effect of the writeback
> > +      // (subtracting WB_OFFSET).
> > +      use_info *new_use;
> > +      if (*def->insn () > *use_insn)
> > +   {
> > +     // We now need USE_INSN to consume DEF.  Create a new use of DEF.
> > +     //
> > +     // N.B. this means until we call change_insns for the main change
> > +     // group we will temporarily have a debug use consuming a def that
> > +     // comes after it, but RTL-SSA doesn't currently support updating
> > +     // debug insns as part of the main change group (together with
> > +     // nondebug changes), so we will have to live with this update
> > +     // leaving the IR being temporarily inconsistent.  It seems to
> > +     // work out OK once the main change group is applied.
> > +     wb_offset *= -1;
> > +     new_use = crtl->ssa->create_use (attempt,
> > +                                      use_insn,
> > +                                      as_a<set_info *> (def));
> > +   }
> > +      else
> > +   new_use = find_access (def->insn ()->uses (), use->regno ());
> > +
> > +      change.new_uses = insert_access (attempt, new_use, change.new_uses);
> > +
> > +      if (dump_file)
> > +   {
> > +     const char *dir = (*def->insn () < *use_insn) ? "down" : "up";
> > +     pretty_printer pp;
> > +     pp_string (&pp, "[");
> > +     pp_access (&pp, use, 0);
> > +     pp_string (&pp, "]");
> > +     pp_string (&pp, " due to wb def ");
> > +     pp_string (&pp, "[");
> > +     pp_access (&pp, def, 0);
> > +     pp_string (&pp, "]");
> > +     fprintf (dump_file,
> > +              "  i%d: fix up debug use %s re-ordered %s, "
> > +              "sub r%u -> r%u + ",
> > +              use_insn->uid (), pp_formatted_text (&pp),
> > +              dir, REGNO (base), REGNO (base));
> > +     print_dec (wb_offset, dump_file);
> > +     fprintf (dump_file, "\n");
> > +   }
> > +
> > +      insn_propagation prop (use_rtl, base,
> > +                        plus_constant (GET_MODE (base), base, wb_offset));
> > +      if (prop.apply_to_pattern (&INSN_VAR_LOCATION_LOC (use_rtl)))
> > +   crtl->ssa->change_insn (change);
> > +      else
> > +   {
> > +     if (dump_file)
> > +       fprintf (dump_file, "  i%d: RTL substitution failed (%s)"
> > +                ", resetting debug insn", use_insn->uid (),
> > +                prop.failure_reason);
> > +     reset_debug_use (use);
> > +   }
> > +    }
> > +  else
> > +    {
> > +      if (dump_file)
> > +   {
> > +     pretty_printer pp;
> > +     pp_string (&pp, "[");
> > +     pp_access (&pp, use, 0);
> > +     pp_string (&pp, "] due to re-ordered load def [");
> > +     pp_access (&pp, def, 0);
> > +     pp_string (&pp, "]");
> > +     fprintf (dump_file, "  i%d: resetting debug use %s\n",
> > +              use_insn->uid (), pp_formatted_text (&pp));
> > +   }
> > +      reset_debug_use (use);
> > +    }
> > +}
> > +
> > +// Update debug uses when folding in a trailing add insn to form a
> > +// writeback pair.
> > +//
> > +// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
> > +// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
> > +// is a trailing add insn which is being folded into the pair to make it
> > +// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
> > +// TRAILING_ADD.
> > +static void
> > +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> > +                          insn_info *pair_dst,
> > +                          insn_info *trailing_add,
> > +                          rtx writeback_effect)
> > +{
> > +  rtx base = SET_DEST (writeback_effect);
> > +
> > +  poly_int64 wb_offset;
> > +  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
> > +  gcc_checking_assert (rtx_equal_p (base, base2));
> > +
> > +  auto defs = trailing_add->defs ();
> > +  gcc_checking_assert (defs.size () == 1);
> > +  def_info *def = defs[0];
> > +
> > +  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
> > +    for (auto use : set->debug_insn_uses ())
> > +      if (*use->insn () > *pair_dst)
> > +   // DEF is getting re-ordered above USE, fix up USE accordingly.
> > +   fixup_debug_use (attempt, use, def, base, wb_offset);
> > +}
> > +
> > +// Called from fuse_pair, fixes up any debug uses that will be affected
> > +// by the changes.
> > +//
> > +// ATTEMPT is the obstack watermark used to allocate RTL-SSA temporaries 
> > for
> > +// the changes, INSNS gives the candidate insns: at this point the use/def
> > +// information should still be as on entry to fuse_pair, but the patterns 
> > may
> > +// have changed, hence we pass ORIG_RTL which contains the original 
> > patterns
> > +// for the candidate insns.
> > +//
> > +// The final pair will be placed immediately after PAIR_DST, LOAD_P is 
> > true if
> > +// it is a load pair, bit I of WRITEBACK is set if INSNS[I] originally had
> > +// writeback, and WRITEBACK_EFFECT is an rtx describing the overall update 
> > to
> > +// the base register in the final pair (if any).  BASE_REGNO gives the 
> > register
> > +// number of the base register used in the final pair.
> > +static void
> > +fixup_debug_uses (obstack_watermark &attempt,
> > +             insn_info *insns[2],
> > +             rtx orig_rtl[2],
> > +             insn_info *pair_dst,
> > +             insn_info *trailing_add,
> > +             bool load_p,
> > +             int writeback,
> > +             rtx writeback_effect,
> > +             unsigned base_regno)
> > +{
> > +  // USE is a debug use that needs updating because DEF (a def of the
> > +  // resource) is being re-ordered over it.  If WRITEBACK_PAT is non-NULL,
> > +  // then it gives the original RTL pattern for DEF's insn, and DEF is a
> > +  // writeback update of the base register.
> > +  //
> > +  // This simply unpacks WRITEBACK_PAT if needed and calls fixup_debug_use.
> > +  auto update_debug_use = [&](use_info *use, def_info *def,
> > +                         rtx writeback_pat)
> > +    {
> > +      poly_int64 offset = 0;
> > +      rtx base = NULL_RTX;
> > +      if (writeback_pat)
> > +   {
> > +     rtx mem = XEXP (writeback_pat, load_p);
> > +     gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
> > +                          == RTX_AUTOINC);
> > +
> > +     base = ldp_strip_offset (mem, &offset);
> > +     gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
> > +   }
> > +      fixup_debug_use (attempt, use, def, base, offset);
> > +    };
> > +
> > +  // Reset any debug uses of mem over which we re-ordered a store.
> > +  //
> > +  // It would be nice to try and preserve debug info here, but it seems 
> > that
> > +  // would require doing alias analysis to see if the store aliases with 
> > the
> > +  // debug use, which seems a little extravagant just to preserve debug 
> > info.
> > +  if (!load_p)
> > +    {
> > +      auto def = memory_access (insns[0]->defs ());
> > +      auto last_def = memory_access (insns[1]->defs ());
> > +      for (; def != last_def; def = def->next_def ())
> > +   if (auto set = dyn_cast<set_info *> (def))
> 
> Similarly to the review of the other patch, I think we should use
> an unconditional as_a here.

Sounds good, I'll make that change.

> 
> > +     for (auto use : set->debug_insn_uses ())
> > +       {
> > +         if (dump_file)
> > +           fprintf (dump_file, "  i%d: resetting debug use of mem\n",
> > +                    use->insn ()->uid ());
> > +         reset_debug_use (use);
> > +       }
> > +    }
> > +
> > +  // Now let's take care of register uses, starting with debug uses
> > +  // attached to defs from our first insn.
> > +  for (auto def : insns[0]->defs ())
> > +    {
> > +      auto set = dyn_cast<set_info *> (def);
> > +      if (!set || set->is_mem () || !set->first_debug_insn_use ())
> > +   continue;
> > +
> > +      def_info *defs[2] = {
> > +   def,
> > +   find_access (insns[1]->defs (), def->regno ())
> > +      };
> > +
> > +      rtx writeback_pats[2] = {};
> > +      if (def->regno () == base_regno)
> > +   for (int i = 0; i < 2; i++)
> > +     if (defs[i] && (writeback & (1 << i)))
> 
> Are both checks needed?  If so, what does it mean for the writeback
> bit to be set but defs[i] to be null?  Could we assert the defs[i]
> part instead?

Yeah, we could assert the defs[i] bit, (writeback & (1 << i)) should imply
defs[i] is non-null.  I'll make that change, thanks.

> 
> > +       writeback_pats[i] = orig_rtl[i];
> > +
> > +      // Now that we've characterized the defs involved, go through the
> > +      // debug uses and determine how to update them (if needed).
> > +      for (auto use : set->debug_insn_uses ())
> > +   {
> > +     if (*pair_dst < *use->insn () && defs[1])
> > +       // We're re-ordering defs[1] above a previous use of the
> > +       // same resource.
> > +       update_debug_use (use, defs[1], writeback_pats[1]);
> > +     else if (*pair_dst >= *use->insn ())
> > +       // We're re-ordering defs[0] below its use.
> > +       update_debug_use (use, defs[0], writeback_pats[0]);
> > +   }
> > +    }
> > +
> > +  // Now let's look at registers which are def'd by the second insn
> > +  // but not by the first insn, there may still be debug uses of a
> > +  // previous def which can be affected by moving the second insn up.
> > +  for (auto def : insns[1]->defs ())
> > +    {
> > +      // This should be M log N where N is the number of defs in
> > +      // insns[0] and M is the number of defs in insns[1].
> > +      if (def->is_mem () || find_access (insns[0]->defs (), def->regno ()))
> > +     continue;
> > +
> > +      auto prev_set = safe_dyn_cast<set_info *> (def->prev_def ());
> > +      if (!prev_set)
> > +   continue;
> > +
> > +      rtx writeback_pat = NULL_RTX;
> > +      if (def->regno () == base_regno && (writeback & 2))
> > +   writeback_pat = orig_rtl[1];
> > +
> > +      // We have a def in insns[1] which isn't def'd by the first insn.
> > +      // Look to the previous def and see if it has any debug uses.
> > +      for (auto use : prev_set->debug_insn_uses ())
> > +   if (*pair_dst < *use->insn ())
> > +     // We're ordering DEF above a previous use of the same register.
> > +     update_debug_use (use, def, writeback_pat);
> 
> This might be more efficient as a reverse walk that breaks as soon as
> *pair_dst >= *use->insn (), since the previous def could be arbitrarily
> far away and have arbitrarily many leading uses.  But it probably doesn't
> matter much in practice.

Agreed, provided it's easy to get at the last debug use in constant time
(if so I guess 1/3 might need updating to add a last_debug_insn_use
accessor).  I'll look into that tomorrow.

> 
> > +    }
> > +
> > +  if ((writeback & 2) && !writeback_effect)
> > +    {
> > +      // If the second insn initially had writeback but the final
> > +      // pair does not, then there may be trailing debug uses of the
> > +      // second writeback def which need re-parenting: do that.
> > +      auto def = find_access (insns[1]->defs (), base_regno);
> > +      gcc_assert (def);
> > +      for (auto use : as_a<set_info *> (def)->debug_insn_uses ())
> > +   {
> > +     insn_change change (use->insn ());
> > +     change.new_uses = check_remove_regno_access (attempt,
> > +                                                  change.new_uses,
> > +                                                  base_regno);
> > +     auto new_use = find_access (insns[0]->uses (), base_regno);
> > +
> > +     // N.B. insns must have already shared a common base due to writeback.
> > +     gcc_assert (new_use);
> > +
> > +     if (dump_file)
> > +       fprintf (dump_file,
> > +                "  i%d: cancelling wb, re-parenting trailing debug use\n",
> > +                use->insn ()->uid ());
> > +
> > +     change.new_uses = insert_access (attempt, new_use, change.new_uses);
> > +     crtl->ssa->change_insn (change);
> > +   }
> > +    }
> > +  else if (trailing_add)
> > +    fixup_debug_uses_trailing_add (attempt, pair_dst, trailing_add,
> > +                              writeback_effect);
> > +}
> > +
> >  // Try and actually fuse the pair given by insns I1 and I2.
> >  //
> >  // Here we've done enough analysis to know this is safe, we only
> > @@ -1378,6 +1681,9 @@ ldp_bb_info::fuse_pair (bool load_p,
> >    insn_info *first = (*i1 < *i2) ? i1 : i2;
> >    insn_info *second = (first == i1) ? i2 : i1;
> >  
> > +  insn_info *pair_dst = move_range.singleton ();
> > +  gcc_assert (pair_dst);
> > +
> >    insn_info *insns[2] = { first, second };
> >  
> >    auto_vec<insn_change *> changes;
> > @@ -1388,6 +1694,13 @@ ldp_bb_info::fuse_pair (bool load_p,
> >      PATTERN (second->rtl ())
> >    };
> >  
> > +  // Make copies of the patterns as we might need to refer to the original 
> > RTL
> > +  // later, for example when updating debug uses (which is after we've 
> > updated
> > +  // one or both of the patterns in the candidate insns).
> > +  rtx orig_rtl[2];
> > +  for (int i = 0; i < 2; i++)
> > +    orig_rtl[i] = copy_rtx (pats[i]);
> > +
> 
> FWIW, an alternative (that avoids RTL allocation) would be to use
> temporarily_undo_changes and redo_changes.  But this is fine too.

Presumably using temporarily_undo_changes would require some bookkeping
around the change numbers that might end up making things more
complicated?

I thought about making the copies conditional on MAY_HAVE_DEBUG_INSNS as
that at least saves copying in the common case with no debug insns (and
perhaps value-initializing orig_rtl to avoid it being used uninitialized
by potential future patches).  WDYT about that?

> 
> The patch is OK from my POV with the as_a change above, but I'd be
> interested in the answer to the writeback question above.

Great, thanks a lot for the reviews!

Alex

> 
> Thanks,
> Richard
> 
> >    use_array input_uses[2] = { first->uses (), second->uses () };
> >    def_array input_defs[2] = { first->defs (), second->defs () };
> >  
> > @@ -1604,9 +1917,7 @@ ldp_bb_info::fuse_pair (bool load_p,
> >        using Action = stp_change_builder::action;
> >        insn_info *store_to_change = try_repurpose_store (first, second,
> >                                                     move_range);
> > -      insn_info *stp_dest = move_range.singleton ();
> > -      gcc_assert (stp_dest);
> > -      stp_change_builder builder (insns, store_to_change, stp_dest);
> > +      stp_change_builder builder (insns, store_to_change, pair_dst);
> >        insn_change *change;
> >        set_info *new_set = nullptr;
> >        for (; !builder.done (); builder.advance ())
> > @@ -1677,7 +1988,7 @@ ldp_bb_info::fuse_pair (bool load_p,
> >             fprintf (dump_file,
> >                      "  stp: changing i%d to use mem from new stp "
> >                      "(after i%d)\n",
> > -                    action.insn->uid (), stp_dest->uid ());
> > +                    action.insn->uid (), pair_dst->uid ());
> >           change->new_uses = drop_memory_access (change->new_uses);
> >           gcc_assert (new_set);
> >           auto new_use = crtl->ssa->create_use (attempt, action.insn,
> > @@ -1741,6 +2052,11 @@ ldp_bb_info::fuse_pair (bool load_p,
> >  
> >    gcc_assert (crtl->ssa->verify_insn_changes (changes));
> >  
> > +  // Fix up any debug uses that will be affected by the changes.
> > +  if (MAY_HAVE_DEBUG_INSNS)
> > +    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
> > +                 load_p, writeback, writeback_effect, base_regno);
> > +
> >    confirm_change_group ();
> >    crtl->ssa->change_insns (changes);
> >  
> > @@ -2807,7 +3123,7 @@ try_promote_writeback (insn_info *insn)
> >  
> >    rtx wb_effect = NULL_RTX;
> >    def_info *add_def;
> > -  const insn_range_info pair_range (insn->prev_nondebug_insn ());
> > +  const insn_range_info pair_range (insn);
> >    insn_info *insns[2] = { nullptr, insn };
> >    insn_info *trailing_add = find_trailing_add (insns, pair_range, 0, 
> > &wb_effect,
> >                                            &add_def, base_def, offset,
> > @@ -2830,14 +3146,16 @@ try_promote_writeback (insn_info *insn)
> >                                     pair_change.new_defs);
> >    gcc_assert (pair_change.new_defs.is_valid ());
> >  
> > -  pair_change.move_range = insn_range_info (insn->prev_nondebug_insn ());
> > -
> >    auto is_changing = insn_is_changing (changes);
> >    for (unsigned i = 0; i < ARRAY_SIZE (changes); i++)
> >      gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], 
> > is_changing));
> >  
> >    gcc_assert (rtl_ssa::recog_ignoring (attempt, pair_change, is_changing));
> >    gcc_assert (crtl->ssa->verify_insn_changes (changes));
> > +
> > +  if (MAY_HAVE_DEBUG_INSNS)
> > +    fixup_debug_uses_trailing_add (attempt, insn, trailing_add, wb_effect);
> > +
> >    confirm_change_group ();
> >    crtl->ssa->change_insns (changes);
> >  }
> > diff --git a/gcc/testsuite/gcc.c-torture/compile/pr113089.c 
> > b/gcc/testsuite/gcc.c-torture/compile/pr113089.c
> > new file mode 100644
> > index 00000000000..70c71f23f1c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.c-torture/compile/pr113089.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-g -funroll-loops" } */
> > +
> > +typedef unsigned short uint16;
> > +
> > +void intrapred_chroma_plane(uint16 ***mb_preds, int* max_imgpel_values, 
> > int crx, int cry, int px) {
> > +  for (int uv = 0; uv < 2; uv++) {
> > +    uint16 **mb_pred = mb_preds[uv + 1];
> > +    uint16 **predU2 = &mb_pred[px - 2];
> > +    uint16 *upPred = &mb_pred[px][px];
> > +    int max_imgpel_value = max_imgpel_values[uv];
> > +
> > +    int ih = upPred[crx - 1];
> > +    for (int i = 0; i < crx*3; ++i)
> > +      ih += upPred[crx*3];
> > +
> > +    int iv = (mb_pred[cry - 1][px+1]);
> > +    for (int i = 0; i < cry - 1; ++i) {
> > +      iv += (i + 1) * (*(mb_preds[uv][0]) - *(*predU2--));
> > +    }
> > +
> > +    for (int j = 0; j < cry; ++j)
> > +      for (int i = 0; i < crx; ++i)
> > +        mb_pred[j][i] = (uint16) (max_imgpel_value * ((i * ih + iv)));
> > +  }
> > +}

Reply via email to