Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> ping
>
>
> From: Wilco Dijkstra
> Sent: 04 June 2021 14:44
> To: Richard Sandiford <richard.sandif...@arm.com>
> Cc: Kyrylo Tkachov <kyrylo.tkac...@arm.com>; GCC Patches 
> <gcc-patches@gcc.gnu.org>
> Subject: [PATCH v3] AArch64: Improve GOT addressing
>
> Hi Richard,
>
> This merges the v1 and v2 patches and removes the spurious MEM from
> ldr_got_small_si/di. This has been rebased after [1], and the performance
> gain has now doubled.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571708.html
>
> Improve GOT addressing by treating the instructions as a pair.  This reduces
> register pressure and improves code quality significantly.  SPECINT2017 
> improves
> by 0.6% with -fPIC and codesize is 0.73% smaller.  Perlbench has 0.9% smaller
> codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1.
>
> Passes bootstrap and regress. OK for commit?

Looks like everyone agrees that
https://github.com/ARM-software/abi-aa/pull/106 should go in in some form,
so I think it's OK for GCC to keep the instructions together.  Some comments
on the implementation though.  (We might have covered these earlier,
sorry if so.)

- Why do we rewrite the constant moves after reload into ldr_got_small_sidi
  and ldr_got_small_<mode>?  Couldn't we just get the move patterns to
  output the sequence directly?

- I think we should leave out the rtx_costs part and deal with that
  separately.  This patch should just be about whether we emit two
  separate define_insns for the move or whether we keep a single one
  (to support relaxation).

Thanks,
Richard

>
> ChangeLog:
> 2021-06-04  Wilco Dijkstra  <wdijk...@arm.com>
>
>         * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload.
>         (movdi): Likewise.
>         (ldr_got_small_<mode>): Remove MEM and LO_SUM, emit ADRP+LDR GOT 
> sequence.
>         (ldr_got_small_sidi): Likewise.
>         * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
>         splitting of GOT accesses until after reload. Remove tmp_reg and MEM.
>         (aarch64_print_operand): Correctly print got_lo12 in L specifier.
>         (aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
>         (aarch64_mov_operand_p): Make GOT accesses valid move operands.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 08245827daa3f8199b29031e754244c078f0f500..11ea33c70fb06194fadfe94322fdfa098e5320fc
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
>
>      case SYMBOL_SMALL_GOT_4G:
>        {
> +       /* Use movdi for GOT accesses until after reload - this improves
> +          CSE and rematerialization.  */
> +       if (!reload_completed)
> +         {
> +           emit_insn (gen_rtx_SET (dest, imm));
> +           return;
> +         }
> +
>          /* In ILP32, the mode of dest can be either SImode or DImode,
>             while the got entry is always of SImode size.  The mode of
>             dest depends on how dest is used: if dest is assigned to a
> @@ -3624,34 +3632,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
>             patterns here (two patterns for ILP32).  */
>
>          rtx insn;
> -       rtx mem;
> -       rtx tmp_reg = dest;
>          machine_mode mode = GET_MODE (dest);
>
> -       if (can_create_pseudo_p ())
> -         tmp_reg = gen_reg_rtx (mode);
> -
> -       emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
>          if (mode == ptr_mode)
>            {
>              if (mode == DImode)
> -             insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
> +             insn = gen_ldr_got_small_di (dest, imm);
>              else
> -             insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
> -
> -           mem = XVECEXP (SET_SRC (insn), 0, 0);
> +             insn = gen_ldr_got_small_si (dest, imm);
>            }
>          else
>            {
>              gcc_assert (mode == Pmode);
> -
> -           insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
> -           mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
> +           insn = gen_ldr_got_small_sidi (dest, imm);
>            }
>
> -       gcc_assert (MEM_P (mem));
> -       MEM_READONLY_P (mem) = 1;
> -       MEM_NOTRAP_P (mem) = 1;
>          emit_insn (insn);
>          return;
>        }
> @@ -11019,7 +11014,7 @@ aarch64_print_operand (FILE *f, rtx x, int code)
>        switch (aarch64_classify_symbolic_expression (x))
>          {
>          case SYMBOL_SMALL_GOT_4G:
> -         asm_fprintf (asm_out_file, ":lo12:");
> +         asm_fprintf (asm_out_file, ":got_lo12:");
>            break;
>
>          case SYMBOL_SMALL_TLSGD:
> @@ -13452,6 +13447,12 @@ cost_plus:
>
>      case SYMBOL_REF:
>        *cost = 0;
> +
> +      /* Use a separate remateralization cost for GOT accesses.  */
> +      if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
> +         && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
> +       *cost = COSTS_N_INSNS (1) / 2;
> +
>        return true;
>
>      case HIGH:
> @@ -19907,6 +19908,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
>        return aarch64_simd_valid_immediate (x, NULL);
>      }
>
> +  /* GOT accesses are valid moves until after regalloc.  */
> +  if (SYMBOL_REF_P (x)
> +      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
> +    return true;
> +
>    x = strip_salt (x);
>    if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
>      return true;
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> abfd84526745d029ad4953eabad6dd17b159a218..30effca6f3562f6870a6cc8097750e63bb0d424d
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
>     fmov\\t%w0, %s1
>     fmov\\t%s0, %s1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
> -  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
> SImode)
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
> SImode)
> +    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
> +    || (reload_completed
> +       && (aarch64_classify_symbolic_expression (operands[1])
> +           == SYMBOL_SMALL_GOT_4G))"
>     [(const_int 0)]
>     "{
>         aarch64_expand_mov_immediate (operands[0], operands[1]);
> @@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%x0, %d1
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
> -   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
> DImode))
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
> DImode)
> +    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
> +    || (reload_completed
> +       && (aarch64_classify_symbolic_expression (operands[1])
> +           == SYMBOL_SMALL_GOT_4G))"
>     [(const_int 0)]
>     "{
>         aarch64_expand_mov_immediate (operands[0], operands[1]);
> @@ -6703,27 +6709,26 @@ (define_insn "add_losym_<mode>"
>    [(set_attr "type" "alu_imm")]
>  )
>
> +;;; GOT accesses use a single insn so linkers can relax GOT relocations.
>  (define_insn "ldr_got_small_<mode>"
>    [(set (match_operand:PTR 0 "register_operand" "=r")
> -       (unspec:PTR [(mem:PTR (lo_sum:PTR
> -                             (match_operand:PTR 1 "register_operand" "r")
> -                             (match_operand:PTR 2 "aarch64_valid_symref" 
> "S")))]
> +       (unspec:PTR [(match_operand:PTR 1 "aarch64_valid_symref" "S")]
>                      UNSPEC_GOTSMALLPIC))]
>    ""
> -  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
> -  [(set_attr "type" "load_<ldst_sz>")]
> +  "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]"
> +  [(set_attr "type" "load_<ldst_sz>")
> +   (set_attr "length" "8")]
>  )
>
>  (define_insn "ldr_got_small_sidi"
>    [(set (match_operand:DI 0 "register_operand" "=r")
>          (zero_extend:DI
> -        (unspec:SI [(mem:SI (lo_sum:DI
> -                            (match_operand:DI 1 "register_operand" "r")
> -                            (match_operand:DI 2 "aarch64_valid_symref" 
> "S")))]
> +        (unspec:SI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
>                      UNSPEC_GOTSMALLPIC)))]
>    "TARGET_ILP32"
> -  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
> -  [(set_attr "type" "load_4")]
> +  "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]"
> +  [(set_attr "type" "load_4")
> +   (set_attr "length" "8")]
>  )
>
>  (define_insn "ldr_got_small_28k_<mode>"

Reply via email to