On Tue, 2 Jul 2019 at 18:22, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford
> > <richard.sandif...@arm.com> wrote:
> >>
> >> Thanks for fixing this.
> >>
> >> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> >> > index 89a46a933fa..79bd0cfbd28 100644
> >> > --- a/gcc/simplify-rtx.c
> >> > +++ b/gcc/simplify-rtx.c
> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,
> >> >       }
> >> >      }
> >> >
> >> > +  /* If op is a vector comparison operator, rewrite it in a new mode.
> >> > +     This probably won't match, but may allow further simplifications.
> >> > +     Also check if number of elements and size of each element
> >> > +     match for outermode and innermode.  */
> >> > +
> >>
> >> Excess blank line after the comment.  IMO the second part of the comment
> >> reads too much like an afterthought.  How about:
> >>
> >>   /* If OP is a vector comparison and the subreg is not changing the
> >>      number of elements or the size of the elements, change the result
> >>      of the comparison to the new mode.  */
> >>
> >> > +  if (COMPARISON_P (op)
> >> > +      && VECTOR_MODE_P (outermode)
> >> > +      && VECTOR_MODE_P (innermode)
> >> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS 
> >> > (innermode)))
> >> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),
> >> > +                 GET_MODE_UNIT_SIZE (innermode))))
> >>
> >> Redundant brackets around the known_eq calls.
> >>
> >> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP 
> >> > (op, 1));
> >>
> >> This should use simplify_gen_relational, so that we try to simplify
> >> the new expression.
> > Does the attached version look OK ?
>
> OK with a suitable changelog (remember to post those!) if it passes testing.
The change to simplify_subreg regressed avx2-pr54700-1.C -;)

For following test-case:
__attribute__((noipa)) __v8sf
f7 (__v8si a, __v8sf b, __v8sf c)
{
  return a < 0 ? b : c;
}

Before patch, combine shows:
Trying 8, 9 -> 10:
    8: r87:V8SI=const_vector
    9: r89:V8SI=r87:V8SI>r90:V8SI
      REG_DEAD r90:V8SI
      REG_DEAD r87:V8SI
   10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104
      REG_DEAD r92:V8SF
      REG_DEAD r89:V8SI
      REG_DEAD r91:V8SF
Successfully matched this instruction:
(set (reg:V8SF 86)
    (unspec:V8SF [
            (reg:V8SF 92)
            (reg:V8SF 91)
            (subreg:V8SF (lt:V8SI (reg:V8SI 90)
                    (const_vector:V8SI [
                            (const_int 0 [0]) repeated x8
                        ])) 0)
        ] UNSPEC_BLENDV))
allowing combination of insns 8, 9 and 10

After applying patch, combine shows:

Trying 8, 9 -> 10:
    8: r87:V8SI=const_vector
    9: r89:V8SI=r87:V8SI>r90:V8SI
      REG_DEAD r90:V8SI
      REG_DEAD r87:V8SI
   10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104
      REG_DEAD r92:V8SF
      REG_DEAD r89:V8SI
      REG_DEAD r91:V8SF
Failed to match this instruction:
(set (reg:V8SF 86)
    (unspec:V8SF [
            (reg:V8SF 92)
            (reg:V8SF 91)
            (lt:V8SF (reg:V8SI 90)
                (const_vector:V8SI [
                        (const_int 0 [0]) repeated x8
                    ]))
        ] UNSPEC_BLENDV))

Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work.
Does it look OK ?

Testing the attached patch in progress.
(A quick comparison for i386.exp now shows no difference before/after patch).

Thanks,
Prathamesh
>
> Thanks,
> Richard
2019-07-03  Prathamesh Kulkarni  <prathamesh.kulka...@linaro.org>

        * fwprop.c (reg_single_def_p): New function.
        (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case.
        (forward_propagate_into): New parameter reg_prop_only
        with default value false.
        Propagate def's src into loop only if SET_SRC and SET_DEST
        of def_set have single definitions.
        Likewise if reg_prop_only is set to true.
        (fwprop): New param fwprop_addr_p.
        Integrate fwprop_addr into fwprop.
        (fwprop_addr): Remove.
        (pass_rtl_fwprop_addr::execute): Call fwprop with arg set
        to true.
        (pass_rtl_fwprop::execute): Call fwprop with arg set to false.
        * simplify-rtx.c (simplify_subreg): Add case for vector comparison.
        * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern.

testsuite/
        * gfortran.dg/pr88833.f90: New test. 

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d7d542524fb..5384fe218f9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16623,10 +16623,9 @@
        (unspec:VF_128_256
          [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
           (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
-          (subreg:VF_128_256
-            (lt:<sseintvecmode>
+            (lt:VF_128_256
               (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")
-              (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)]
+              (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))]
          UNSPEC_BLENDV))]
   "TARGET_SSE4_1"
   "#"
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..e6f37527192 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -448,6 +448,18 @@ enum {
   PR_OPTIMIZE_FOR_SPEED = 4
 };
 
+/* Check that X has a single def.  */
+
+static bool
+reg_single_def_p (rtx x)
+{
+  if (!REG_P (x))
+    return false;
+
+  int regno = REGNO (x);
+  return (DF_REG_DEF_COUNT (regno) == 1
+         && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno));
+}
 
 /* Replace all occurrences of OLD in *PX with NEW and try to simplify the
    resulting expression.  Replace *PX with a new RTL expression if an
@@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int 
flags)
          tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
                                     SUBREG_BYTE (x));
        }
+
+      else
+       {
+         rtvec vec;
+         rtvec newvec;
+         const char *fmt = GET_RTX_FORMAT (code);
+         rtx op;
+
+         for (int i = 0; fmt[i]; i++)
+           switch (fmt[i])
+             {
+             case 'E':
+               vec = XVEC (x, i);
+               newvec = vec;
+               for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+                 {
+                   op = RTVEC_ELT (vec, j);
+                   valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+                   if (op != RTVEC_ELT (vec, j))
+                     {
+                       if (newvec == vec)
+                         {
+                           newvec = shallow_copy_rtvec (vec);
+                           if (!tem)
+                             tem = shallow_copy_rtx (x);
+                           XVEC (tem, i) = newvec;
+                         }
+                       RTVEC_ELT (newvec, j) = op;
+                     }
+                 }
+               break;
+
+             case 'e':
+               if (XEXP (x, i))
+                 {
+                   op = XEXP (x, i);
+                   valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+                   if (op != XEXP (x, i))
+                     {
+                       if (!tem)
+                         tem = shallow_copy_rtx (x);
+                       XEXP (tem, i) = op;
+                     }
+                 }
+               break;
+             }
+       }
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn 
*def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only
+      && (!reg_single_def_p (SET_SRC (def_set))
+         || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
+  /* Allow propagations into a loop only for reg-to-reg copies, since
+     replacing one register by another shouldn't increase the cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && (!reg_single_def_p (SET_SRC (def_set))
+         || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1553,7 @@ gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1572,16 @@ fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-       if (DF_REF_TYPE (use) == DF_REF_REG_USE
-           || DF_REF_BB (use)->loop_father == NULL
-           /* The outer most loop is not really a loop.  */
-           || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-         forward_propagate_into (use);
+       {
+         if (DF_REF_TYPE (use) == DF_REF_REG_USE
+             || DF_REF_BB (use)->loop_father == NULL
+             /* The outer most loop is not really a loop.  */
+             || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+           forward_propagate_into (use, fwprop_addr_p);
+
+         else if (fwprop_addr_p)
+           forward_propagate_into (use, false);
+       }
     }
 
   fwprop_done ();
@@ -1537,7 +1612,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-       break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-       if (DF_REF_TYPE (use) != DF_REF_REG_USE
-           && DF_REF_BB (use)->loop_father != NULL
-           /* The outer most loop is not really a loop.  */
-           && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-         forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1648,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 89a46a933fa..dd2acd4eca9 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op,
        }
     }
 
+  /* If OP is a vector comparison and the subreg is not changing the
+     number of elements or the size of the elements, change the result
+     of the comparison to the new mode.  */
+  if (COMPARISON_P (op)
+      && VECTOR_MODE_P (outermode)
+      && VECTOR_MODE_P (innermode)
+      && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))
+      && known_eq (GET_MODE_UNIT_SIZE (outermode),
+                   GET_MODE_UNIT_SIZE (innermode)))
+    return simplify_gen_relational (GET_CODE (op), outermode, innermode,
+                                   XEXP (op, 0), XEXP (op, 1));
   return NULL_RTX;
 }
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 
b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, 
(w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }

Reply via email to