Hi, The attached patch tries to fix PR88833. For the following test-case: subroutine foo(x) real :: x(100) x = x + 10 end subroutine foo
Assembly with -O3 -march=armv8.2-a+sve: foo_: .LFB0: .cfi_startproc mov w2, 100 mov w3, w2 mov x1, 0 whilelo p0.s, wzr, w2 fmov z1.s, #1.0e+1 .p2align 3,,7 .L2: ld1w z0.s, p0/z, [x0, x1, lsl 2] fadd z0.s, z0.s, z1.s st1w z0.s, p0, [x0, x1, lsl 2] incw x1 whilelo p0.s, w1, w3 bne .L2 ret As we can see, it generates extra mov w3, w2. Instead it could have reused w2 in both whilelo's. expand produces: insn 7: reg:SI 97 = 100 insn 8: use (reg:SI 97) insn 22: reg:SI 105 = 100 insn 23: use (reg:SI 105) Both reg:SI 97 and reg:SI 105 have only single definitions (and also single use). cse2 then replaces 100 with reg:SI 97 in insn 22, which becomes: insn 22: reg:SI 105 = reg:SI 97. sched1 then reorders instructions, and insn 7 and insn 22 end up falling in same basic block Looking at reload dump: Choosing alt 3 in insn 7: (0) r (1) M {*movsi_aarch64} alt=0,overall=0,losers=0,rld_nregs=0 Choosing alt 0 in insn 2: (0) =r (1) r {*movdi_aarch64} alt=0,overall=0,losers=0,rld_nregs=0 Choosing alt 0 in insn 22: (0) =r (1) r {*movsi_aarch64} 1 Non-pseudo reload: reject+=2 1 Non input pseudo reload: reject++ Cycle danger: overall += LRA_MAX_REJECT alt=0,overall=609,losers=1,rld_nregs=1 which shows, it ends up taking extra register. The issue here is that cse2 pass is leaving opportunities for propagating register copies. To address this, the patch makes following changes to fwprop.c: (a) Add support for handling UNSPEC in propagate_rtx_1 in a similar manner to simplify_replace_fn_rtx. (b) Allow propagating def inside a loop if source of def is a register in forward_propagate_into. AFAIU, replacing register by another register shouldn't increase cost. (c) Integrate fwprop and fwprop_addr, and make fwprop_addr propagate register copies. With the patch, fwprop_addr propagates reg:SI 97 in insn 23 and deletes insn 22, which eliminates the redundant mov. Does this patch look OK ? Bootstrapped + tested on x86_64-unknown-linux-gnu and aarch64-linux-gnu. Cross-testing with SVE in progress. Thanks, Prathamesh
diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..93a1a10c9a6 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only && !REG_P (SET_SRC (def_set))) + return false; + + /* Allow propagating def inside loop only if source of def_set is + reg, since replacing reg by source reg shouldn't increase cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && !REG_P (SET_SRC (def_set))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1538,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1557,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1597,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1609,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1633,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }