From: Artemiy Volkov <arte...@synopsys.com>

Presently, the scheduler code only considers consecutive instructions
for macro-op fusion (see sched-deps.cc::sched_macro_fuse_insns () for
details).  This patch introduces the new dep_fusion pass, which is
intended to uncover more fusion opportunities by reordering eligible
instructions to form fusible pairs (based solely on the value of the
TARGET_SCHED_MACRO_FUSION_PAIR_P hook).  This is achieved by using
the RTL-SSA framework, and only single-use instructions are
considered as the first instruction of a pair.

Aside from reordering instructions, this pass also sets the SCHED_GROUP
flag on the second instruction of each pair so that subsequent passes can
implement special handling of the fused pairs.  For instance, RA and
regrename should make use of this information to preserve the
single-output property of some such pairs.  Accordingly, in passes.def,
this patch adds two invocations of the new pass: one just before IRA and
one just before regrename.

The new pass is enabled at -O2+ and -Os.

gcc/ChangeLog:

        * Makefile.in (OBJS): Add dep-fusion.o.
        * common.opt (fdep-fusion): Add option.
        * dep-fusion.cc: New pass.
        * doc/invoke.texi: Document it.
        * opts.cc (default_options_table): Enable it at -O2+ and -Os.
        * passes.def: Insert two instances of dep_fusion.
        * tree-pass.h (make_pass_dep_fusion): Declare new function.

Suggested-by: Jeff Law <j...@ventanamicro.com>
Signed-off-by: Artemiy Volkov <artem...@acm.org>
---
 gcc/Makefile.in     |   1 +
 gcc/common.opt      |   4 ++
 gcc/dep-fusion.cc   | 148 ++++++++++++++++++++++++++++++++++++++++++++
 gcc/doc/invoke.texi |  15 ++++-
 gcc/opts.cc         |   1 +
 gcc/passes.def      |   2 +
 gcc/tree-pass.h     |   1 +
 7 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 gcc/dep-fusion.cc

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 7314a3b4225..fd6c08e826e 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1450,6 +1450,7 @@ OBJS = \
        dce.o \
        ddg.o \
        debug.o \
+       dep-fusion.o \
        df-core.o \
        df-problems.o \
        df-scan.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 70659fabebd..2440c72c135 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1392,6 +1392,10 @@ fdelete-null-pointer-checks
 Common Var(flag_delete_null_pointer_checks) Init(-1) Optimization
 Delete useless null pointer checks.
 
+fdep-fusion
+Common Var(flag_dep_fusion) Optimization Init(1)
+Issue defining instructions back to back with their single uses, provided they are macro-fusible in the target microarchitecture.
+
 fdevirtualize-at-ltrans
 Common Var(flag_ltrans_devirtualize)
 Stream extra data to support more aggressive devirtualization in LTO local transformation mode.
diff --git a/gcc/dep-fusion.cc b/gcc/dep-fusion.cc
new file mode 100644
index 00000000000..1e69e68dd87
--- /dev/null
+++ b/gcc/dep-fusion.cc
@@ -0,0 +1,148 @@
+// Dependency fusion reordering pass.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+//
+// This pass uses the RTL-SSA representation to detect def-use pairs that are
+// macro-op-fusible in the current microarchitecture (using the
+// macro_fusion_pair_p () target hook) and place them next to one another, if
+// possible.
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_MEMORY
+#define INCLUDE_ARRAY
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "print-rtl.h"
+#include "tree-pass.h"
+#include "cfgcleanup.h"
+#include "target.h"
+#include "dbgcnt.h"
+
+namespace {
+const pass_data pass_data_dep_fusion = // Pass descriptor for dep_fusion.
+{
+  RTL_PASS, // type: the pass class derives from rtl_opt_pass
+  "dep_fusion", // name: the pass name, matching the -fdep-fusion option
+  OPTGROUP_NONE, // optinfo_flags: no optimization-info group
+  TV_NONE, // tv_id: no dedicated timing variable
+  0, // properties_required: none
+  0, // properties_provided: none
+  0, // properties_destroyed: none
+  0, // todo_flags_start: nothing requested before the pass runs
+  TODO_df_finish, // todo_flags_finish: requested after the pass runs
+};
+
+// Pass object for dep_fusion.  It is constructed from pass_data_dep_fusion;
+// the gate and execute hooks are defined out of line further down.
+class pass_dep_fusion : public rtl_opt_pass
+{
+public:
+  pass_dep_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_dep_fusion, ctxt)
+  {
+  }
+
+  // Hand out a fresh instance of this pass bound to the same context.
+  opt_pass *
+  clone () override
+  {
+    return new pass_dep_fusion (m_ctxt);
+  }
+
+  // Remaining opt_pass overrides.
+  bool gate (function *) override;
+  unsigned int execute (function *) override;
+};
+
+// Decide whether the pass should run: it requires a non-zero optimization
+// level and flag_dep_fusion to be set.  The function argument is unused.
+bool
+pass_dep_fusion::gate (function *)
+{
+  if (optimize <= 0)
+    return false;
+
+  return flag_dep_fusion != 0;
+}
+
+// Run the pass over FN.  For every optimizable instruction that has exactly
+// one definition, whose definition is a set with a single non-debug using
+// instruction, and which the target's macro_fusion_pair_p hook accepts
+// together with that user, move the user directly after the definition (if it
+// is not already there) and set SCHED_GROUP_P on the user.  Always returns 0.
+unsigned int
+pass_dep_fusion::execute (function *fn)
+{
+  // The pairing hook is optional.  Guard against targets that leave it unset,
+  // since the loop below would otherwise call through a null pointer.
+  if (!targetm.sched.macro_fusion_pair_p)
+    return 0;
+
+  // Initialization.
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new rtl_ssa::function_info (fn);
+
+  // Paired with the init_recog () call in the finalization code below.
+  init_recog_no_volatile ();
+
+  for (rtl_ssa::insn_info *insn = *crtl->ssa->nondebug_insns ().begin ();
+       insn;
+       insn = insn->next_nondebug_insn ())
+    {
+      if (!insn->can_be_optimized () || insn->num_defs () != 1)
+        continue;
+
+      rtl_ssa::set_info *def = single_set_info (insn);
+      if (!def)
+        continue;
+
+      // Only a definition with a single non-debug user is a candidate.
+      rtl_ssa::use_info *use = def->single_nondebug_insn_use ();
+      if (!use)
+        continue;
+
+      // USER is the would-be second instruction of the fused pair.
+      rtl_ssa::insn_info *user = use->insn ();
+      if (!user->can_be_optimized ()
+          || !targetm.sched.macro_fusion_pair_p (insn->rtl (), user->rtl ()))
+        continue;
+
+      // NOTE(review): ATTEMPT is never read; it is presumably kept alive only
+      // to scope the temporary state of this change attempt -- confirm.
+      auto attempt = crtl->ssa->new_change_attempt ();
+      rtl_ssa::insn_change change (user);
+
+      // The user only has to be moved if it does not already follow INSN.
+      if (user != insn->next_any_insn ())
+        {
+          if (!can_move_insn_p (user))
+            continue;
+
+          change.move_range = insn;
+          if (!rtl_ssa::restrict_movement (change))
+            continue;
+
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            {
+              fprintf (dump_file, "Moved a single-use instruction:\n");
+              dump_insn_slim (dump_file, user->rtl ());
+              fprintf (dump_file, "right after its definition:\n");
+              dump_insn_slim (dump_file, insn->rtl ());
+            }
+        }
+
+      // Flag the second instruction of the pair so that later passes can
+      // recognize it, then commit the change.
+      SCHED_GROUP_P (user->rtl ()) = 1;
+      confirm_change_group ();
+      crtl->ssa->change_insn (change);
+    }
+
+  // Finalization.
+  if (crtl->ssa->perform_pending_updates ())
+    cleanup_cfg (0);
+
+  // Reset the pointer as well, so that no stale rtl_ssa::function_info
+  // address is left behind in crtl once the object is gone.
+  delete crtl->ssa;
+  crtl->ssa = nullptr;
+
+  init_recog ();
+  free_dominance_info (CDI_DOMINATORS);
+  return 0;
+}
+
+} // end namespace
+
+// Factory for the dep_fusion pass: return a newly allocated pass object
+// constructed with context CTXT.
+
+rtl_opt_pass *
+make_pass_dep_fusion (gcc::context *ctxt)
+{
+  pass_dep_fusion *new_pass = new pass_dep_fusion (ctxt);
+  return new_pass;
+}
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 09802303254..6be413ac869 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -582,8 +582,8 @@ Objective-C and Objective-C++ Dialects}.
 -fcse-follow-jumps  -fcse-skip-blocks  -fcx-fortran-rules
 -fcx-limited-range -fcx-method
 -fdata-sections  -fdce  -fdelayed-branch
--fdelete-null-pointer-checks  -fdevirtualize  -fdevirtualize-speculatively
--fdevirtualize-at-ltrans  -fdse
+-fdelete-null-pointer-checks  -fdep-fusion  -fdevirtualize
+-fdevirtualize-speculatively  -fdevirtualize-at-ltrans  -fdse
 -fearly-inlining  -fipa-sra  -fexpensive-optimizations  -ffat-lto-objects
 -ffast-math  -ffinite-math-only  -ffloat-store  -fexcess-precision=@var{style}
 -ffinite-loops
@@ -13020,7 +13020,7 @@ also turns on the following optimization flags:
 -fcode-hoisting
 -fcrossjumping
 -fcse-follow-jumps  -fcse-skip-blocks
--fdelete-null-pointer-checks
+-fdelete-null-pointer-checks  -fdep-fusion
 -fdevirtualize  -fdevirtualize-speculatively
 -fexpensive-optimizations
 -ffinite-loops
@@ -15940,6 +15940,15 @@ more efficiently if they are adjacent to each other in the instruction flow.
 
 Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
 
+@opindex fdep-fusion
+@item -fdep-fusion
+Detect macro-op fusible pairs consisting of single-use instructions and their
+uses, and place such pairs together in the instruction stream to increase
+fusion opportunities in hardware.  This pass is executed once before register
+allocation, and another time before register renaming.
+
+Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
+
 @opindex ftracer
 @item -ftracer
 Perform tail duplication to enlarge superblock size.  This transformation
diff --git a/gcc/opts.cc b/gcc/opts.cc
index c21e66ba917..e7e8d1d4d96 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -636,6 +636,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fcode_hoisting, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fcrossjumping, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fcse_follow_jumps, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_fdep_fusion, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fdevirtualize, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fdevirtualize_speculatively, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fexpensive_optimizations, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index d528a0477d9..b81c7c2be51 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -514,6 +514,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_sched);
       NEXT_PASS (pass_rtl_avoid_store_forwarding);
       NEXT_PASS (pass_early_remat);
+      NEXT_PASS (pass_dep_fusion);
       NEXT_PASS (pass_ira);
       NEXT_PASS (pass_reload);
       /* In the following, some passes are tied to 'pass_postreload' and others
@@ -535,6 +536,7 @@ along with GCC; see the file COPYING3.  If not see
          NEXT_PASS (pass_sched_fusion);
          NEXT_PASS (pass_peephole2);
          NEXT_PASS (pass_if_after_reload);
+         NEXT_PASS (pass_dep_fusion);
          NEXT_PASS (pass_regrename);
          NEXT_PASS (pass_fold_mem_offsets);
          NEXT_PASS (pass_cprop_hardreg);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 1c68a69350d..61cec52c624 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -625,6 +625,7 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context
                                                              *ctxt);
 extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_late_combine (gcc::context *ctxt);
+extern rtl_opt_pass *make_pass_dep_fusion (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
-- 
2.43.0

Reply via email to