Implements a machine reorg pass for AArch64/Falkor to handle prefetcher tag collisions. This is strictly not part of the loop unroller, but on Falkor, unrolling can make the hardware prefetcher perform badly if there are too many tag collisions, based on the discussion in https://gcc.gnu.org/ml/gcc/2017-10/msg00178.html.
gcc/ChangeLog: 2018-02-12 Kugan Vivekanandarajah <kug...@linaro.org> * config/aarch64/aarch64.c (iv_p): New. (strided_load_p): Likewise. (make_tag): Likewise. (get_load_info): Likewise. (aarch64_reorg): Likewise. (TARGET_MACHINE_DEPENDENT_REORG): Implement new target hook.
From 0cd4f5acb2117c739ba81bb4b8b71af499107812 Mon Sep 17 00:00:00 2001 From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> Date: Mon, 12 Feb 2018 10:44:53 +1100 Subject: [PATCH 4/4] reorg-for-tag-collision Change-Id: Ic6e42d54268c9112ec1c25de577ca92c1808eeff --- gcc/config/aarch64/aarch64.c | 353 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 1ce2a0c..48e7c54 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -71,6 +71,7 @@ #include "selftest.h" #include "selftest-rtl.h" #include "rtx-vector-builder.h" +#include "cfgrtl.h" /* This file should be included last. */ #include "target-def.h" @@ -17203,6 +17204,355 @@ aarch64_select_early_remat_modes (sbitmap modes) } } +static bool +iv_p (rtx reg, struct loop *loop) +{ + df_ref adef; + unsigned regno = REGNO (reg); + bool def_in_loop = false; + bool def_out_loop = false; + + if (GET_MODE_CLASS (GET_MODE (reg)) != MODE_INT) + return false; + + for (adef = DF_REG_DEF_CHAIN (regno); adef; adef = DF_REF_NEXT_REG (adef)) + { + if (!DF_REF_INSN_INFO (adef) + || !NONDEBUG_INSN_P (DF_REF_INSN (adef))) + continue; + + basic_block bb = DF_REF_BB (adef); + if (dominated_by_p (CDI_DOMINATORS, bb, loop->header) + && bb->loop_father == loop) + { + rtx_insn *insn = DF_REF_INSN (adef); + recog_memoized (insn); + rtx pat = PATTERN (insn); + if (GET_CODE (pat) != SET) + continue; + rtx x = SET_SRC (pat); + if (GET_CODE (x) == ZERO_EXTRACT + || GET_CODE (x) == ZERO_EXTEND + || GET_CODE (x) == SIGN_EXTEND) + x = XEXP (x, 0); + if (MEM_P (x)) + continue; + if (GET_CODE (x) == POST_INC + || GET_CODE (x) == POST_DEC + || GET_CODE (x) == PRE_INC + || GET_CODE (x) == PRE_DEC) + def_in_loop = true; + else if (BINARY_P (x)) + def_in_loop = true; + } + if (dominated_by_p (CDI_DOMINATORS, loop->header, bb)) + def_out_loop = true; + if (def_in_loop && def_out_loop) + return true; + } + return false; 
+} + +/* Return true if X is a strided load. */ + +static bool +strided_load_p (rtx x, + struct loop *loop, + bool *pre_post, + rtx *base, + rtx *offset) +{ + /* Loadded value is extended, get src. */ + if (GET_CODE (x) == ZERO_EXTRACT + || GET_CODE (x) == ZERO_EXTEND + || GET_CODE (x) == SIGN_EXTEND) + x = XEXP (x, 0); + + /* If it is not MEM_P, it is not lodade from mem. */ + if (!MEM_P (x)) + return false; + + /* Get the src of MEM_P. */ + x = XEXP (x, 0); + + /* If it is a post/pre increment, get the src. */ + if (GET_CODE (x) == POST_INC + || GET_CODE (x) == POST_DEC + || GET_CODE (x) == PRE_INC + || GET_CODE (x) == PRE_DEC) + { + x = XEXP (x, 0); + *pre_post = true; + } + + /* get base and offset depending on the type. */ + if (REG_P (x) + || UNARY_P (x)) + { + if (!REG_P (x)) + x = XEXP (x, 0); + if (REG_P (x) + && iv_p (x, loop)) + { + *base = x; + return true; + } + } + else if (BINARY_P (x)) + { + rtx reg1, reg2; + reg1 = XEXP (x, 0); + + if (REG_P (reg1) + && REGNO (reg1) == SP_REGNUM) + return false; + reg2 = XEXP (x, 1); + + if (REG_P (reg1) + && iv_p (reg1, loop)) + { + + *base = reg1; + *offset = reg2; + return true; + } + + if (REG_P (reg1) + && REG_P (reg2) + && iv_p (reg2, loop)) + { + *base = reg1; + *offset = reg2; + return true; + } + } + return false; +} + +static unsigned +make_tag (unsigned dest, unsigned base, unsigned offset) +{ + return (dest & 0xf) + | ((base & 0xf) << 4) + | ((offset & 0x3f) << 8); +} + + +/* Return true if X INSN is a strided load. 
*/ + +static bool +get_load_info (rtx_insn *insn, + struct loop *loop, + bool *pre_post, + rtx *base, + rtx *dest, + rtx *offset) +{ + subrtx_var_iterator::array_type array; + if (!INSN_P (insn) || recog_memoized (insn) < 0) + return false; + rtx pat = PATTERN (insn); + switch (GET_CODE (pat)) + { + case PARALLEL: + { + for (int j = 0; j < XVECLEN (pat, 0); ++j) + { + rtx ex = XVECEXP (pat, 0, j); + FOR_EACH_SUBRTX_VAR (iter, array, ex, NONCONST) + { + const_rtx x = *iter; + if (GET_CODE (x) == SET + && strided_load_p (SET_SRC (x), loop, pre_post, + base, offset)) + { + *dest = SET_DEST (x); + return true; + } + } + } + } + break; + + case SET: + FOR_EACH_SUBRTX_VAR (iter, array, SET_SRC (pat), NONCONST) + { + rtx x = *iter; + if (strided_load_p (x, loop, pre_post, + base, offset)) + { + *dest = SET_DEST (pat); + return true; + } + } + + default: + break; + } + return false; +} + +static void +aarch64_reorg (void) +{ + basic_block *body, bb; + struct loop *loop; + rtx_insn *insn; + + if (aarch64_tune != falkor) + return; + + compute_bb_for_insn (); + /* Compute live regs. */ + df_compute_regs_ever_live (true); + df_analyze (); + + /* Find the loops. */ + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + calculate_dominance_info (CDI_DOMINATORS); + FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) + { + hash_map <rtx, auto_vec<rtx_insn *> > tag_map (512); + body = get_loop_body (loop); + auto_vec <rtx> tags; + + /* Record all the memory tags. 
*/ + for (unsigned i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + FOR_BB_INSNS (bb, insn) + { + unsigned tag; + rtx base = NULL_RTX; + rtx dest = NULL_RTX; + rtx offset = NULL_RTX; + bool pre_or_post = false; + + if (!INSN_P (insn) + || DEBUG_INSN_P (insn)) + continue; + + if (get_load_info (insn, loop, &pre_or_post, + &base, &dest, &offset)) + { + int int_offset = 0; + if (offset && REG_P (offset)) + int_offset = (1 << 5) | REGNO (offset); + else if (offset && CONST_INT_P (offset)) + { + int_offset = INTVAL (offset); + int_offset /= GET_MODE_SIZE (GET_MODE (dest)).to_constant (); + if (!pre_or_post) + int_offset >>= 2; + } + tag = make_tag (REGNO (dest), REGNO (base), int_offset); + rtx t = GEN_INT (tag); + if (!tag_map.get (t)) + tags.safe_push (t); + tag_map.get_or_insert (t).safe_push (insn); + } + } + } + + for (unsigned i = 0; i < tags.length (); ++i) + { + rtx t = tags[i]; + auto_vec<rtx_insn *> *v = tag_map.get (t); + + for (int j = v->length () - 1; j > 0; --j) + { + /* Get the insns that has tags colliding. */ + rtx_insn *insn = (*v)[j]; + rtx pat; + bool changed = false; + int int_offset = 0; + rtx base = NULL_RTX; + rtx dest = NULL_RTX; + rtx offset = NULL_RTX; + bool pre_or_post = false; + + if (!get_load_info (insn, loop, &pre_or_post, + &base, &dest, &offset)) + gcc_assert (false); + + if (offset && REG_P (offset)) + int_offset = (1 << 5) | REGNO (offset); + else if (offset && CONST_INT_P (offset)) + { + int_offset = INTVAL (offset); + int_offset /= GET_MODE_SIZE (GET_MODE (dest)).to_constant (); + if (!pre_or_post) + int_offset >>= 2; + } + + /* Go over temporary registers and find a free register, if + available. */ + for (int k = R9_REGNUM; !changed && (k <= R15_REGNUM); k++) + if (!df_hard_reg_used_p (k)) + { + unsigned tag; + rtx t; + + tag = make_tag (REGNO (dest), k, int_offset); + t = GEN_INT (tag); + /* Check to see if the new tag also collides with an + existing load. 
*/ + if (tag_map.get (t)) + continue; + + machine_mode mode = GET_MODE (base); + rtx new_reg = gen_rtx_REG (mode, k); + t = GEN_INT (make_tag (REGNO (dest), REGNO (new_reg), + int_offset)); + vec <rtx_insn *> *v2 = tag_map.get (t); + if (v2 && (v2->length () > 0)) + continue; + + /* Change the insn: dest = load (base, offset) + into tmp = base; dest = load (tmp, offset). */ + extract_insn (insn); + for (int l = 0; + (!changed) && (l < recog_data.n_operands); l++) + { + subrtx_ptr_iterator::array_type array; + rtx *op = recog_data.operand_loc[l]; + + if (recog_data.operand_type[l] == OP_OUT) + continue; + + FOR_EACH_SUBRTX_PTR (iter, array, op, NONCONST) + { + rtx *loc = *iter; + rtx x = *loc; + + if (!changed && (base == x)) + { + pat = gen_rtx_SET (new_reg, base); + if (validate_change (insn, loc, new_reg, false)) + { + emit_insn_before (pat, insn); + if (pre_or_post) + { + rtx pat2 = gen_rtx_SET (base, new_reg); + emit_insn_after (pat2, insn); + } + } + v->pop (); + tag_map.get_or_insert (t).safe_push (insn); + changed = true; + break; + } + } + } + } + } + } + } + + loop_optimizer_finalize (); + df_finish_pass (true); +} + /* Target-specific selftests. */ #if CHECKING_P @@ -17675,6 +18025,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_HW_MAX_MEM_READ_STREAMS #define TARGET_HW_MAX_MEM_READ_STREAMS aarch64_hw_max_mem_read_streams +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG aarch64_reorg + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests -- 2.7.4