This patch implements the TARGET_SCHED_FUSION_PRIORITY hook for the Synopsys
RHX-100 processor to improve instruction scheduling by prioritizing fusible
memory operations.
To take better advantage of double load/store fusion, make use of the
sched_fusion pass that assigns unique "fusion priorities" to load/store
instructions and schedules operations on adjacent addresses together.  This
maximizes the probability that loads/stores are fused with each other
rather than with other instructions.
gcc/ChangeLog:
* config/riscv/arcv.cc (arcv_fusion_load_store): New function.
(arcv_sched_fusion_priority): New function.
* config/riscv/riscv-protos.h (arcv_sched_fusion_priority): Declare.
* config/riscv/riscv.cc (riscv_sched_fusion_priority): New function.
(TARGET_SCHED_FUSION_PRIORITY): Define hook.
Co-authored-by: Artemiy Volkov <[email protected]>
Co-authored-by: Michiel Derhaeg <[email protected]>
Signed-off-by: Luis Silva <[email protected]>
---
gcc/config/riscv/arcv.cc | 101 ++++++++++++++++++++++++++++++++
gcc/config/riscv/riscv-protos.h | 1 +
gcc/config/riscv/riscv.cc | 17 ++++++
3 files changed, 119 insertions(+)
diff --git a/gcc/config/riscv/arcv.cc b/gcc/config/riscv/arcv.cc
index f5d5fb6cd22..af237660226 100644
--- a/gcc/config/riscv/arcv.cc
+++ b/gcc/config/riscv/arcv.cc
@@ -497,3 +497,104 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
return false;
}
+
+/* If INSN is a load or store whose address has the form [base+offset],
+   store the two parts in *BASE and *OFFSET, the access mode in *MODE,
+   and set *IS_LOAD to TRUE for a load.  Return TRUE if INSN is such an
+   instruction, otherwise return FALSE.  */
+
+static bool
+arcv_fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset,
+			machine_mode *mode, bool *is_load)
+{
+  rtx x, dest, src;
+
+  gcc_assert (INSN_P (insn));
+  x = PATTERN (insn);
+  if (GET_CODE (x) != SET)
+    return false;
+
+  src = SET_SRC (x);
+  dest = SET_DEST (x);
+
+  if ((GET_CODE (src) == SIGN_EXTEND || GET_CODE (src) == ZERO_EXTEND)
+      && MEM_P (XEXP (src, 0)))
+    src = XEXP (src, 0);
+
+  if (REG_P (src) && MEM_P (dest))
+    {
+      *is_load = false;
+      *mode = GET_MODE (dest);
+      return extract_base_offset_in_addr (dest, base, offset);
+    }
+
+  if (MEM_P (src) && REG_P (dest))
+    {
+      *is_load = true;
+      *mode = GET_MODE (src);
+      return extract_base_offset_in_addr (src, base, offset);
+    }
+
+  /* Not a simple register<->memory move.  */
+  return false;
+}
+
+/* Implement TARGET_SCHED_FUSION_PRIORITY for the RHX-100.  Assign INSN a
+   fusion priority (*FUSION_PRI) and a within-group priority (*PRI) so the
+   sched_fusion pass schedules loads/stores on adjacent addresses together.  */
+void
+arcv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri,
+			    int *pri)
+{
+  rtx base = NULL_RTX, offset = NULL_RTX;
+  machine_mode mode = SImode;
+  bool is_load;
+
+  gcc_assert (INSN_P (insn));
+
+  /* Default priority for non-fusible instructions.  */
+  int default_pri = max_pri - 1;
+
+  /* Check if this is a fusible load/store instruction.  */
+  if (!arcv_fusion_load_store (insn, &base, &offset, &mode, &is_load)
+      || !arcv_pair_fusion_mode_allowed_p (mode, is_load))
+    {
+      *pri = default_pri;
+      *fusion_pri = default_pri;
+      return;
+    }
+
+  /* Half the default priority distinguishes fusible instructions.  */
+  int priority = default_pri / 2;
+
+  /* Scale priority by access width - narrower accesses get lower priority.
+     HImode: divide by 2, QImode: divide by 4.  This encourages wider
+     accesses to be scheduled together.  */
+  if (mode == HImode)
+    priority /= 2;
+  else if (mode == QImode)
+    priority /= 4;
+
+  /* Factor in the base register: a shift of 20 bits makes it the most
+     significant component, grouping accesses off the same base.  */
+  const int BASE_REG_SHIFT = 20;
+  const int BASE_REG_MASK = 0xff;
+  priority -= ((REGNO (base) & BASE_REG_MASK) << BASE_REG_SHIFT);
+
+  /* Fusion priority: divide the offset by (mode_size * 2) to put pairs of
+     adjacent accesses into the same group, then shift left by 1 to make
+     room for the load/store bit.  */
+  int off_val = (int)(INTVAL (offset));
+  int addr_group = off_val / (GET_MODE_SIZE (mode).to_constant () * 2);
+  *fusion_pri = priority - (addr_group << 1) + is_load;
+
+  /* Factor in the actual offset value: smaller offsets get higher
+     priority.  Use only the lower 20 bits to avoid overflow.  */
+  const int OFFSET_MASK = 0xfffff;
+  if (off_val >= 0)
+    priority -= (off_val & OFFSET_MASK);
+  else
+    priority += ((-off_val) & OFFSET_MASK);
+
+  *pri = priority;
+}
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 2dcf8a4d697..c3157c084d1 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -841,6 +841,7 @@ extern bool th_print_operand_address (FILE *, machine_mode, rtx);
/* Routines implemented in arcv.cc. */
extern bool arcv_macro_fusion_pair_p (rtx_insn *, rtx_insn *);
+extern void arcv_sched_fusion_priority (rtx_insn *, int, int *, int *);
extern bool strided_load_broadcast_p (void);
extern bool riscv_prefer_agnostic_p (void);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 9c0b55eca0d..fabd381e159 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -11790,6 +11790,21 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
return false;
}
+/* Implement TARGET_SCHED_FUSION_PRIORITY.  */
+static void
+riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri,
+			     int *pri)
+{
+  if (TARGET_ARCV_RHX100)
+    {
+      arcv_sched_fusion_priority (insn, max_pri, fusion_pri, pri);
+      return;
+    }
+  /* Default: lowest priority, no fusion grouping.  */
+  *pri = max_pri - 1;
+  *fusion_pri = max_pri - 1;
+}
+
/* Adjust the cost/latency of instructions for scheduling.
For now this is just used to change the latency of vector instructions
according to their LMUL. We assume that an insn with LMUL == 8 requires
@@ -16444,6 +16459,8 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode)
#define TARGET_SCHED_MACRO_FUSION_P riscv_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p
+#undef TARGET_SCHED_FUSION_PRIORITY
+#define TARGET_SCHED_FUSION_PRIORITY riscv_sched_fusion_priority
#undef TARGET_SCHED_INIT
#define TARGET_SCHED_INIT riscv_sched_init
--
2.34.0