Version v2 uses movsi/di for GOT accesses until after reload as suggested. This
caused worse spilling, however improving the costs of GOT accesses resulted in
better codesize and performance gains:
Improve GOT addressing by treating the instructions as a pair. This reduces
register pressure and improves code quality significantly. SPECINT2017 improves
by 0.30% with -fPIC and codesize is 0.7% smaller. Perlbench has 0.9% smaller
codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1.
Passes bootstrap and regress. OK for commit?
ChangeLog:
2021-05-21  Wilco Dijkstra  <wdijkstr@arm.com>
* config/aarch64/aarch64.md (movsi): Split GOT accesses after reload.
(movdi): Likewise.
* config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
splitting of GOT accesses until after reload.
(aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
(aarch64_macro_fusion_pair_p): Fuse GOT accesses.
---
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index
641c83b479e76cbcc75b299eb7ae5f634d9db7cd..75b3caa94dd8a52342bbddbfcb73ab06a7418907
100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
case SYMBOL_SMALL_GOT_4G:
{
+ /* Don't split into ADRP/LDR until after reload - this improves
+ CSE and rematerialization of GOT accesses. */
+ if (!reload_completed)
+ {
+ emit_insn (gen_rtx_SET (dest, imm));
+ return;
+ }
+
/* In ILP32, the mode of dest can be either SImode or DImode,
while the got entry is always of SImode size. The mode of
dest depends on how dest is used: if dest is assigned to a
@@ -13460,6 +13468,14 @@ cost_plus:
*cost += COSTS_N_INSNS (1);
if (speed)
*cost += 2 * extra_cost->alu.arith;
+
+      /* Set a low rematerialization cost for GOT accesses - this blocks
+	 them from being spilled and reduces register pressure.  */
+ if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
+ && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
+ *cost = COSTS_N_INSNS (1) / 2;
+
+ return true;
}
else if (aarch64_cmodel == AARCH64_CMODEL_TINY
|| aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
@@ -19930,6 +19946,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
return aarch64_simd_valid_immediate (x, NULL);
}
+  /* GOT accesses are split after regalloc.  */
+  if (SYMBOL_REF_P (x)
+      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
+    return true;
+
x = strip_salt (x);
if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
return true;
@@ -23746,6 +23767,24 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn
*curr)
}
}
+  /* Always treat GOT accesses as a pair to ensure they can be easily
+     identified and optimized in linkers.  */
+  if (simple_sets_p)
+    {
+      /* We're trying to match:
+	   prev (adrp) == (set (reg r1) (high (symbol_ref ("SYM"))))
+	   curr (add)  == (set (reg r0)
+			    (unspec [(mem (lo_sum (reg r1)
+					   (symbol_ref ("SYM"))))]
+				    UNSPEC_GOTSMALLPIC))  */
+
+      if (satisfies_constraint_Ush (SET_SRC (prev_set))
+	  && REG_P (SET_DEST (prev_set))
+	  && REG_P (SET_DEST (curr_set))
+	  && GET_CODE (SET_SRC (curr_set)) == UNSPEC
+	  && XINT (SET_SRC (curr_set), 1) == UNSPEC_GOTSMALLPIC)
+	return true;
+    }
+
+
if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
{
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index
abfd84526745d029ad4953eabad6dd17b159a218..2527c96576a78f2071da20721143a27adeb1551b
100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
fmov\\t%w0, %s1
fmov\\t%s0, %s1
* return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
- "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]),
SImode)
-&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+ "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]),
SImode)
+&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+|| (reload_completed
+ && (aarch64_classify_symbolic_expression (operands[1])
+ == SYMBOL_SMALL_GOT_4G))"
[(const_int 0)]
"{
aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
fmov\\t%x0, %d1
fmov\\t%d0, %d1
* return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
- "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]),
DImode))
-&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]