On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> wrote: > > On Mon, 24 Jun 2019 at 19:51, Richard Sandiford > <richard.sandif...@arm.com> wrote: > > > > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: > > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > > > if (!def_set) > > > return false; > > > > > > + if (reg_prop_only > > > + && !REG_P (SET_SRC (def_set)) > > > + && !REG_P (SET_DEST (def_set))) > > > + return false; > > > > This should be: > > > > if (reg_prop_only > > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) > > return false; > > > > so that we return false if either operand isn't a register. > Oops, sorry about that -:( > > > > > + > > > + /* Allow propagations into a loop only for reg-to-reg copies, since > > > + replacing one register by another shouldn't increase the cost. */ > > > + > > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > > > + && !REG_P (SET_SRC (def_set)) > > > + && !REG_P (SET_DEST (def_set))) > > > + return false; > > > > Same here. > > > > OK with that change, thanks. > Thanks for the review, will make the changes and commit the patch > after re-testing. Hi, Testing the patch showed following failures on 32-bit x86:
Executed from: g++.target/i386/i386.exp g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra Executed from: gcc.target/i386/i386.exp gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t ]*\\%eax,[\\t ]*%eax 1 The failure of pr88152.C is also seen on x86_64. For pr66768.c, and pr90178.c, forwprop replaces register which is volatile and frame related respectively. To avoid that, the attached patch, makes a stronger constraint that src and dest should be a register and not have frame_related or volatil flags set, which is checked in usable_reg_p(). Which avoids the failures for both the cases. Does it look OK ? For g++.target/i386/pr88152.C, the issue is that after the patch, forwprop1 does following propagation (in f10) which wasn't done before: In insn 10, replacing (unspec:SI [ (reg:V2DF 91) ] UNSPEC_MOVMSK) with (unspec:SI [ (subreg:V2DF (reg:V2DI 90) 0) ] UNSPEC_MOVMSK) This later defeats combine because insn 9 gets deleted. Without patch, the following combination takes place: Trying 7 -> 9: 7: r90:V2DI=r89:V2DI>r93:V2DI REG_DEAD r93:V2DI REG_DEAD r89:V2DI 9: r91:V2DF=r90:V2DI#0 REG_DEAD r90:V2DI Successfully matched this instruction: (set (subreg:V2DI (reg:V2DF 91) 0) (gt:V2DI (reg:V2DI 89) (reg:V2DI 93))) allowing combination of insns 7 and 9 and then: Trying 6, 9 -> 10: 6: r89:V2DI=const_vector 9: r91:V2DF#0=r89:V2DI>r93:V2DI REG_DEAD r89:V2DI REG_DEAD r93:V2DI 10: r87:SI=unspec[r91:V2DF] 43 REG_DEAD r91:V2DF Successfully matched this instruction: (set (reg:SI 87) (unspec:SI [ (lt:V2DF (reg:V2DI 93) (const_vector:V2DI [ (const_int 0 [0]) repeated x2 ])) ] UNSPEC_MOVMSK)) allowing combination of insns 6, 9 and 10 original costs 4 + 8 + 4 = 16 replacement cost 12 deferring deletion of insn with uid = 9. deferring deletion of insn with uid = 6. which deletes insns 2, 3, 6, 7, 9. With patch, it fails to combine 7->10: Trying 7 -> 10: 7: r90:V2DI=r89:V2DI>r93:V2DI REG_DEAD r93:V2DI REG_DEAD r89:V2DI 10: r87:SI=unspec[r90:V2DI#0] 43 REG_DEAD r90:V2DI Failed to match this instruction: (set (reg:SI 87) (unspec:SI [ (subreg:V2DF (gt:V2DI (reg:V2DI 89) (reg:V2DI 93)) 0) ] UNSPEC_MOVMSK)) and subsequently 6, 7 -> 10 (attached combine dumps before and after patch). So IIUC, the issue is that the target does not have a pattern that can match the above insn ? I tried a simple workaround to "pessimize" the else condition in propagate_rtx_1 in patch, to require old_rtx and new_rtx have same rtx_code, which at least works for this test-case, but not sure if that's the correct approach. Could you suggest how to proceed ? Thanks, Prathamesh > > Thanks, > Prathamesh > > > > Richard
before.combine
Description: Binary data
after.combine
Description: Binary data
diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..fd4e4eb2816 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -448,6 +448,22 @@ enum { PR_OPTIMIZE_FOR_SPEED = 4 }; +/* Avoid propagating X if it is a reg with one of the below flags set. */ + +static bool +usable_reg_p (rtx x) +{ + if (!REG_P (x)) + return false; + + if (RTX_FLAG (x, frame_related)) + return false; + + else if (RTX_FLAG (x, volatil)) + return false; + + return true; +} /* Replace all occurrences of OLD in *PX with NEW and try to simplify the resulting expression. Replace *PX with a new RTL expression if an @@ -547,6 +563,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1434,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1459,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1476,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && (!usable_reg_p (SET_SRC (def_set)) + || !usable_reg_p (SET_DEST (def_set)))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && (!usable_reg_p (SET_SRC (def_set)) + || !usable_reg_p (SET_DEST (def_set)))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1557,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1576,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1616,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1628,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1652,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }