From: Artemiy Volkov <arte...@synopsys.com>

Presently, the scheduler code only considers consecutive instructions
for macro-op fusion (see sched-deps.cc::sched_macro_fuse_insns () for
details).  This patch introduces the new dep_fusion pass, which is
intended to uncover more fusion opportunities by reordering eligible
instructions to form fusible pairs (based solely on the value of the
TARGET_SCHED_MACRO_FUSION_PAIR_P hook).  This is achieved by using the
RTL-SSA framework; only instructions whose result has a single use are
considered as the first instruction of a pair.

Aside from reordering instructions, this pass also sets the SCHED_GROUP
flag for the second instruction so that following passes can implement
special handling of the fused pairs.  For instance, RA and regrename
should make use of this information to preserve the single-output
property for some of these pairs.  Accordingly, in passes.def, this
patch adds two invocations of the new pass: just before IRA and just
before regrename.

The new pass is enabled at -O2+ and -Os.

gcc/ChangeLog:

	* Makefile.in (OBJS): Add dep-fusion.o.
	* common.opt (fdep-fusion): Add option.
	* dep-fusion.cc: New pass.
	* doc/invoke.texi: Document it.
	* opts.cc (default_options_table): Enable it at -O2+ and -Os.
	* passes.def: Insert two instances of dep_fusion.
	* tree-pass.h (make_pass_dep_fusion): Declare new function.

Suggested-by: Jeff Law <j...@ventanamicro.com>
Signed-off-by: Artemiy Volkov <artem...@acm.org>
---
 gcc/Makefile.in     |   1 +
 gcc/common.opt      |   4 ++
 gcc/dep-fusion.cc   | 169 ++++++++++++++++++++++++++++++++++++++++++++
 gcc/doc/invoke.texi |  15 ++++-
 gcc/opts.cc         |   1 +
 gcc/passes.def      |   2 +
 gcc/tree-pass.h     |   1 +
 7 files changed, 190 insertions(+), 3 deletions(-)
 create mode 100644 gcc/dep-fusion.cc

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 7314a3b4225..fd6c08e826e 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1450,6 +1450,7 @@ OBJS = \
 	dce.o \
 	ddg.o \
 	debug.o \
+	dep-fusion.o \
 	df-core.o \
 	df-problems.o \
 	df-scan.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 70659fabebd..2440c72c135 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1392,6 +1392,10 @@ fdelete-null-pointer-checks
 Common Var(flag_delete_null_pointer_checks) Init(-1) Optimization
 Delete useless null pointer checks.
 
+fdep-fusion
+Common Var(flag_dep_fusion) Optimization
+Issue defining instructions back to back with their single uses, provided they are macro-fusible in the target microarchitecture.
+
 fdevirtualize-at-ltrans
 Common Var(flag_ltrans_devirtualize)
 Stream extra data to support more aggressive devirtualization in LTO local transformation mode.
diff --git a/gcc/dep-fusion.cc b/gcc/dep-fusion.cc
new file mode 100644
index 00000000000..1e69e68dd87
--- /dev/null
+++ b/gcc/dep-fusion.cc
@@ -0,0 +1,169 @@
+// Dependency fusion reordering pass.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+//
+// This pass uses the RTL-SSA representation to detect def-use pairs that are
+// macro-op-fusible in the current microarchitecture (using the
+// macro_fusion_pair_p () target hook) and place them next to one another, if
+// possible.
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_MEMORY
+#define INCLUDE_ARRAY
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "print-rtl.h"
+#include "tree-pass.h"
+#include "cfgcleanup.h"
+#include "target.h"
+#include "dbgcnt.h"
+
+namespace {
+// Static description of the pass for the pass manager.
+const pass_data pass_data_dep_fusion =
+{
+  RTL_PASS, // type
+  "dep_fusion", // name
+  OPTGROUP_NONE, // optinfo_flags
+  TV_NONE, // tv_id
+  0, // properties_required
+  0, // properties_provided
+  0, // properties_destroyed
+  0, // todo_flags_start
+  TODO_df_finish, // todo_flags_finish
+};
+
+// The pass itself.  clone () lets it be instantiated more than once.
+class pass_dep_fusion : public rtl_opt_pass
+{
+public:
+  pass_dep_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_dep_fusion, ctxt)
+  {}
+
+  // opt_pass methods:
+  opt_pass *clone () override { return new pass_dep_fusion (m_ctxt); }
+  bool gate (function *) override;
+  unsigned int execute (function *) override;
+};
+
+// Run the pass whenever we are optimizing and -fdep-fusion is in effect.
+
+bool
+pass_dep_fusion::gate (function *)
+{
+  return optimize > 0 && flag_dep_fusion;
+}
+
+// Walk the nondebug instructions of FN.  For every instruction with a single
+// definition whose only nondebug use is in an instruction that the target
+// reports as macro-fusible with it, move the user directly after the
+// definition (if RTL-SSA permits the move) and set SCHED_GROUP_P on the user
+// so that later passes can recognize the fused pair.
+
+unsigned int
+pass_dep_fusion::execute (function *fn)
+{
+  // Initialization.
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new rtl_ssa::function_info (fn);
+
+  init_recog_no_volatile ();
+
+  for (rtl_ssa::insn_info *insn = *crtl->ssa->nondebug_insns ().begin ();
+       insn;
+       insn = insn->next_nondebug_insn ())
+    {
+      if (!insn->can_be_optimized () || insn->num_defs () != 1)
+	continue;
+
+      rtl_ssa::set_info *def = single_set_info (insn);
+      if (!def)
+	continue;
+
+      // The value must be consumed by exactly one nondebug instruction.
+      rtl_ssa::use_info *use = def->single_nondebug_insn_use ();
+      if (!use)
+	continue;
+
+      rtl_ssa::insn_info *use_insn = use->insn ();
+      if (!use_insn->can_be_optimized ()
+	  || !targetm.sched.macro_fusion_pair_p (insn->rtl (),
+						 use_insn->rtl ()))
+	continue;
+
+      auto attempt = crtl->ssa->new_change_attempt ();
+      rtl_ssa::insn_change change (use_insn);
+
+      // If the pair is not adjacent yet, try to place the user immediately
+      // after the defining instruction.
+      if (use_insn != insn->next_any_insn ())
+	{
+	  if (!can_move_insn_p (use_insn))
+	    continue;
+
+	  change.move_range = insn;
+	  if (!rtl_ssa::restrict_movement (change))
+	    continue;
+
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file, "Moved a single-use instruction:\n");
+	      dump_insn_slim (dump_file, use_insn->rtl ());
+	      fprintf (dump_file, "right after its definition:\n");
+	      dump_insn_slim (dump_file, insn->rtl ());
+	    }
+	}
+
+      // Flag the user so that following passes can give the pair special
+      // treatment.
+      SCHED_GROUP_P (use_insn->rtl ()) = 1;
+      confirm_change_group ();
+      crtl->ssa->change_insn (change);
+    }
+
+  // Finalization.
+  if (crtl->ssa->perform_pending_updates ())
+    cleanup_cfg (0);
+
+  // Reset the pointer after freeing so that no stale RTL-SSA info is left
+  // reachable through crtl once the pass has finished.
+  delete crtl->ssa;
+  crtl->ssa = nullptr;
+
+  init_recog ();
+  free_dominance_info (CDI_DOMINATORS);
+  return 0;
+}
+
+} // end namespace
+
+// Create a new dep fusion pass instance.
+
+rtl_opt_pass *
+make_pass_dep_fusion (gcc::context *ctxt)
+{
+  return new pass_dep_fusion (ctxt);
+}
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 09802303254..6be413ac869 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -582,8 +582,8 @@ Objective-C and Objective-C++ Dialects}.
-fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules -fcx-limited-range -fcx-method -fdata-sections -fdce -fdelayed-branch --fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively --fdevirtualize-at-ltrans -fdse +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize +-fdevirtualize-speculatively -fdevirtualize-at-ltrans -fdse -fearly-inlining -fipa-sra -fexpensive-optimizations -ffat-lto-objects -ffast-math -ffinite-math-only -ffloat-store -fexcess-precision=@var{style} -ffinite-loops @@ -13020,7 +13020,7 @@ also turns on the following optimization flags: -fcode-hoisting -fcrossjumping -fcse-follow-jumps -fcse-skip-blocks --fdelete-null-pointer-checks +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize -fdevirtualize-speculatively -fexpensive-optimizations -ffinite-loops @@ -15940,6 +15940,15 @@ more efficiently if they are adjacent to each other in the instruction flow. Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. +@opindex fdep-fusion +@item -fdep-fusion +Detect macro-op fusible pairs consisting of single-use instructions and their +uses, and place such pairs together in the instruction stream to increase +fusion opportunities in hardware. This pass is executed once before register +allocation, and another time before register renaming. + +Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. + @opindex ftracer @item -ftracer Perform tail duplication to enlarge superblock size. 
This transformation diff --git a/gcc/opts.cc b/gcc/opts.cc index c21e66ba917..e7e8d1d4d96 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -636,6 +636,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_2_PLUS, OPT_fcode_hoisting, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcrossjumping, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcse_follow_jumps, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fdep_fusion, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize_speculatively, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fexpensive_optimizations, NULL, 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index d528a0477d9..b81c7c2be51 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -514,6 +514,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched); NEXT_PASS (pass_rtl_avoid_store_forwarding); NEXT_PASS (pass_early_remat); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_ira); NEXT_PASS (pass_reload); /* In the following, some passes are tied to 'pass_postreload' and others @@ -535,6 +536,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched_fusion); NEXT_PASS (pass_peephole2); NEXT_PASS (pass_if_after_reload); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_regrename); NEXT_PASS (pass_fold_mem_offsets); NEXT_PASS (pass_cprop_hardreg); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 1c68a69350d..61cec52c624 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -625,6 +625,7 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context *ctxt); extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt); extern rtl_opt_pass *make_pass_late_combine (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_dep_fusion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context -- 2.43.0