On Tue, 2 Jul 2019 at 18:22, Richard Sandiford <richard.sandif...@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: > > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford > > <richard.sandif...@arm.com> wrote: > >> > >> Thanks for fixing this. > >> > >> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: > >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > >> > index 89a46a933fa..79bd0cfbd28 100644 > >> > --- a/gcc/simplify-rtx.c > >> > +++ b/gcc/simplify-rtx.c > >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, > >> > } > >> > } > >> > > >> > + /* If op is a vector comparison operator, rewrite it in a new mode. > >> > + This probably won't match, but may allow further simplifications. > >> > + Also check if number of elements and size of each element > >> > + match for outermode and innermode. */ > >> > + > >> > >> Excess blank line after the comment. IMO the second part of the comment > >> reads too much like an afterthought. How about: > >> > >> /* If OP is a vector comparison and the subreg is not changing the > >> number of elements or the size of the elements, change the result > >> of the comparison to the new mode. */ > >> > >> > + if (COMPARISON_P (op) > >> > + && VECTOR_MODE_P (outermode) > >> > + && VECTOR_MODE_P (innermode) > >> > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS > >> > (innermode))) > >> > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), > >> > + GET_MODE_UNIT_SIZE (innermode)))) > >> > >> Redundant brackets around the known_eq calls. > >> > >> > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP > >> > (op, 1)); > >> > >> This should use simplify_gen_relational, so that we try to simplify > >> the new expression. > > Does the attached version look OK ? > > OK with a suitable changelog (remember to post those!) if it passes testing. The change to simplify_subreg regressed avx2-pr54700-1.C -;)
For following test-case: __attribute__((noipa)) __v8sf f7 (__v8si a, __v8sf b, __v8sf c) { return a < 0 ? b : c; } Before patch, combine shows: Trying 8, 9 -> 10: 8: r87:V8SI=const_vector 9: r89:V8SI=r87:V8SI>r90:V8SI REG_DEAD r90:V8SI REG_DEAD r87:V8SI 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 REG_DEAD r92:V8SF REG_DEAD r89:V8SI REG_DEAD r91:V8SF Successfully matched this instruction: (set (reg:V8SF 86) (unspec:V8SF [ (reg:V8SF 92) (reg:V8SF 91) (subreg:V8SF (lt:V8SI (reg:V8SI 90) (const_vector:V8SI [ (const_int 0 [0]) repeated x8 ])) 0) ] UNSPEC_BLENDV)) allowing combination of insns 8, 9 and 10 After applying patch, combine shows: Trying 8, 9 -> 10: 8: r87:V8SI=const_vector 9: r89:V8SI=r87:V8SI>r90:V8SI REG_DEAD r90:V8SI REG_DEAD r87:V8SI 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 REG_DEAD r92:V8SF REG_DEAD r89:V8SI REG_DEAD r91:V8SF Failed to match this instruction: (set (reg:V8SF 86) (unspec:V8SF [ (reg:V8SF 92) (reg:V8SF 91) (lt:V8SF (reg:V8SI 90) (const_vector:V8SI [ (const_int 0 [0]) repeated x8 ])) ] UNSPEC_BLENDV)) Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work. Does it look OK ? Testing the attached patch in progress. (A quick comparison for i386.exp now shows no difference before/after patch). Thanks, Prathamesh > > Thanks, > Richard
2019-07-03 Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> * fwprop.c (reg_single_def_p): New function. (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case. (forward_propagate_into): New parameter reg_prop_only with default value false. Propagate def's src into loop only if SET_SRC and SET_DEST of def_set have single definitions. Likewise if reg_prop_only is set to true. (fwprop): New param fwprop_addr_p. Integrate fwprop_addr into fwprop. (fwprop_addr): Remove. (pass_rtl_fwprop_addr::execute): Call fwprop with arg set to true. (pass_rtl_fwprop::execute): Call fwprop with arg set to false. * simplify-rtx.c (simplify_subreg): Add case for vector comparison. * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern. testsuite/ * gfortran.dg/pr88833.f90: New test. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d7d542524fb..5384fe218f9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16623,10 +16623,9 @@ (unspec:VF_128_256 [(match_operand:VF_128_256 1 "register_operand" "0,0,x") (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (subreg:VF_128_256 - (lt:<sseintvecmode> + (lt:VF_128_256 (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..e6f37527192 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -448,6 +448,18 @@ enum { PR_OPTIMIZE_FOR_SPEED = 4 }; +/* Check that X has a single def. */ + +static bool +reg_single_def_p (rtx x) +{ + if (!REG_P (x)) + return false; + + int regno = REGNO (x); + return (DF_REG_DEF_COUNT (regno) == 1 + && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno)); +} /* Replace all occurrences of OLD in *PX with NEW and try to simplify the resulting expression. Replace *PX with a new RTL expression if an @@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1553,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1572,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1612,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1648,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 89a46a933fa..dd2acd4eca9 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op, } } + /* If OP is a vector comparison and the subreg is not changing the + number of elements or the size of the elements, change the result + of the comparison to the new mode. */ + if (COMPARISON_P (op) + && VECTOR_MODE_P (outermode) + && VECTOR_MODE_P (innermode) + && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)) + && known_eq (GET_MODE_UNIT_SIZE (outermode), + GET_MODE_UNIT_SIZE (innermode))) + return simplify_gen_relational (GET_CODE (op), outermode, innermode, + XEXP (op, 0), XEXP (op, 1)); return NULL_RTX; } diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }