Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-31 Thread Hongtao Liu via Gcc-patches
On Mon, May 31, 2021 at 6:14 PM Richard Biener
 wrote:
>
> On Thu, May 27, 2021 at 9:05 AM Hongtao Liu  wrote:
> >
> > On Wed, May 26, 2021 at 8:41 PM Richard Biener
> >  wrote:
> > >
> > > On Wed, May 26, 2021 at 7:06 AM Hongtao Liu  wrote:
> > > >
> > > > On Tue, May 25, 2021 at 6:24 PM Richard Biener
> > > >  wrote:
> > > > >
> > > > > On Mon, May 24, 2021 at 11:52 AM Hongtao Liu  
> > > > > wrote:
> > > > > >
> > > > > > Hi:
> > > > > >   Details described in PR.
> > > > > >   Bootstrapped and regtest on
> > > > > > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> > > > > > -march=cascadelake,-march=cascadelake}
> > > > > >   Ok for trunk?
> > > > >
> > > > > +static tree
> > > > > +strip_nop_cond_scalar_reduction (bool has_nop, tree op)
> > > > > +{
> > > > > +  if (!has_nop)
> > > > > +return op;
> > > > > +
> > > > > +  if (TREE_CODE (op) != SSA_NAME)
> > > > > +return NULL_TREE;
> > > > > +
> > > > > +  gimple* stmt = SSA_NAME_DEF_STMT (op);
> > > > > +  if (!stmt
> > > > > +  || gimple_code (stmt) != GIMPLE_ASSIGN
> > > > > +  || gimple_has_volatile_ops (stmt)
> > > > > +  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
> > > > > +return NULL_TREE;
> > > > > +
> > > > > +  return gimple_assign_rhs1 (stmt);
> > > > >
> > > > > this allows arbitrary conversions where the comment suggests you
> > > > > only want to allow conversions to the same precision but different 
> > > > > sign.
> > > > > Sth like
> > > > >
> > > > >   gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
> > > > >   if (!stmt
> > > > >   || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
> > > > >   || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
> > > > > (gimple_assign_rhs1 (stmt))))
> > > > > return NULL_TREE;
> > > > >
> > Changed.
> > > > > +  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
> > > > > + || gimple_code (stmt) != GIMPLE_ASSIGN
> > > > > + || gimple_has_volatile_ops (stmt))
> > > > > +   return false;
> > > > >
> > > > > !is_gimple_assign (stmt) instead of gimple_code (stmt) != 
> > > > > GIMPLE_ASSIGN
> > > > >
> > Changed.
> > > > > the gimple_has_volatile_ops check is superfluous given you restrict
> > > > > the assign code.
> > > > >
> > Changed.
> > > > > +  /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
> > > > > +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
> > > > > +   {
> > > > > + gimple *use_stmt = USE_STMT (use_p);
> > > > > + if (is_gimple_debug (use_stmt))
> > > > > +   continue;
> > > > > + if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
> > > > > +   continue;
> > > > > + if (gimple_code (use_stmt) != GIMPLE_PHI)
> > > > > +   return false;
> > > > >
> > > > > can the last check be use_stmt == phi since we should have the
> > > > > PHI readily available?
> > > > >
> > Yes, changed.
> > > > > @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
> > > > > gimple_stmt_iterator *gsi,
> > > > >rhs = fold_build2 (gimple_assign_rhs_code (reduc),
> > > > >  TREE_TYPE (rhs1), op0, tmp);
> > > > >
> > > > > +  if (has_nop)
> > > > > +{
> > > > > +  /* Create assignment nop_rhs = op0 +/- _ifc_.  */
> > > > > +  tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, 
> > > > > "_nop_");
> > > > > +  gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
> > > > > +  gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
> > > > > +  /* Rebuild rhs for nop_expr.  */
> > > > > +  rhs = fold_build1 (NOP_EXPR,
> > > > > +TREE_TYPE (gimple_assign_lhs (nop_reduc)),
> > > > > +nop_rhs);
> > > > > +
> > > > > +  /* Delete nop_reduc.  */
> > > > > +  stmt_it = gsi_for_stmt (nop_reduc);
> > > > > +  gsi_remove (&stmt_it, true);
> > > > > +  release_defs (nop_reduc);
> > > > > +}
> > > > > +
> > > > >
> > > > > hmm, the whole function could be cleaned up with sth like
> > > > >
> > > > >  /* Build rhs for unconditional increment/decrement.  */
> > > > >  gimple_seq stmts = NULL;
> > > > >  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > > > > TREE_TYPE (rhs1), op0, tmp);
> > > > >  if (has_nop)
> > > > >rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
> > > > > (nop_reduc)), rhs);
> > > > >  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > >
> > > > > plus in the caller moving the
> > > > >
> > > > >   new_stmt = gimple_build_assign (res, rhs);
> > > > >   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> > > > >
> > > > > to the else branch as well as the folding done on new_stmt (maybe 
> > > > > return
> > > > > new_stmt instead of rhs from convert_scalar_cond_reduction.
> > > > Eventually, we needed to assign rhs to res, and with an extra mov stmt
> > > > from rhs to res, the vectorizer failed.
> > > > the only difference in 

Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-31 Thread Richard Biener via Gcc-patches
On Thu, May 27, 2021 at 9:05 AM Hongtao Liu  wrote:
>
> On Wed, May 26, 2021 at 8:41 PM Richard Biener
>  wrote:
> >
> > On Wed, May 26, 2021 at 7:06 AM Hongtao Liu  wrote:
> > >
> > > On Tue, May 25, 2021 at 6:24 PM Richard Biener
> > >  wrote:
> > > >
> > > > On Mon, May 24, 2021 at 11:52 AM Hongtao Liu  wrote:
> > > > >
> > > > > Hi:
> > > > >   Details described in PR.
> > > > >   Bootstrapped and regtest on
> > > > > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> > > > > -march=cascadelake,-march=cascadelake}
> > > > >   Ok for trunk?
> > > >
> > > > +static tree
> > > > +strip_nop_cond_scalar_reduction (bool has_nop, tree op)
> > > > +{
> > > > +  if (!has_nop)
> > > > +return op;
> > > > +
> > > > +  if (TREE_CODE (op) != SSA_NAME)
> > > > +return NULL_TREE;
> > > > +
> > > > +  gimple* stmt = SSA_NAME_DEF_STMT (op);
> > > > +  if (!stmt
> > > > +  || gimple_code (stmt) != GIMPLE_ASSIGN
> > > > +  || gimple_has_volatile_ops (stmt)
> > > > +  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
> > > > +return NULL_TREE;
> > > > +
> > > > +  return gimple_assign_rhs1 (stmt);
> > > >
> > > > this allows arbitrary conversions where the comment suggests you
> > > > only want to allow conversions to the same precision but different sign.
> > > > Sth like
> > > >
> > > >   gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
> > > >   if (!stmt
> > > >   || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
> > > >   || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
> > > > (gimple_assign_rhs1 (stmt))))
> > > > return NULL_TREE;
> > > >
> Changed.
> > > > +  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
> > > > + || gimple_code (stmt) != GIMPLE_ASSIGN
> > > > + || gimple_has_volatile_ops (stmt))
> > > > +   return false;
> > > >
> > > > !is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN
> > > >
> Changed.
> > > > the gimple_has_volatile_ops check is superfluous given you restrict
> > > > the assign code.
> > > >
> Changed.
> > > > +  /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
> > > > +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
> > > > +   {
> > > > + gimple *use_stmt = USE_STMT (use_p);
> > > > + if (is_gimple_debug (use_stmt))
> > > > +   continue;
> > > > + if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
> > > > +   continue;
> > > > + if (gimple_code (use_stmt) != GIMPLE_PHI)
> > > > +   return false;
> > > >
> > > > can the last check be use_stmt == phi since we should have the
> > > > PHI readily available?
> > > >
> Yes, changed.
> > > > @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
> > > > gimple_stmt_iterator *gsi,
> > > >rhs = fold_build2 (gimple_assign_rhs_code (reduc),
> > > >  TREE_TYPE (rhs1), op0, tmp);
> > > >
> > > > +  if (has_nop)
> > > > +{
> > > > +  /* Create assignment nop_rhs = op0 +/- _ifc_.  */
> > > > +  tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, 
> > > > "_nop_");
> > > > +  gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
> > > > +  gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
> > > > +  /* Rebuild rhs for nop_expr.  */
> > > > +  rhs = fold_build1 (NOP_EXPR,
> > > > +TREE_TYPE (gimple_assign_lhs (nop_reduc)),
> > > > +nop_rhs);
> > > > +
> > > > +  /* Delete nop_reduc.  */
> > > > +  stmt_it = gsi_for_stmt (nop_reduc);
> > > > +  gsi_remove (&stmt_it, true);
> > > > +  release_defs (nop_reduc);
> > > > +}
> > > > +
> > > >
> > > > hmm, the whole function could be cleaned up with sth like
> > > >
> > > >  /* Build rhs for unconditional increment/decrement.  */
> > > >  gimple_seq stmts = NULL;
> > > >  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > > > TREE_TYPE (rhs1), op0, tmp);
> > > >  if (has_nop)
> > > >rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
> > > > (nop_reduc)), rhs);
> > > >  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > >
> > > > plus in the caller moving the
> > > >
> > > >   new_stmt = gimple_build_assign (res, rhs);
> > > >   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> > > >
> > > > to the else branch as well as the folding done on new_stmt (maybe return
> > > > new_stmt instead of rhs from convert_scalar_cond_reduction.
> > > Eventually, we needed to assign rhs to res, and with an extra mov stmt
> > > from rhs to res, the vectorizer failed.
> > > the only difference in 166t.ifcvt between successful vectorization
> > > and failed vectorization is below
> > >char * _24;
> > >char _25;
> > >unsigned char _ifc__29;
> > > +  unsigned char _30;
> > >
> > > [local count: 118111600]:
> > >if (n_10(D) != 0)
> > > @@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n)
> > >_5 = c_14(D) + _1;
> > 

Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-27 Thread Hongtao Liu via Gcc-patches
 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> > >
> > > to the else branch as well as the folding done on new_stmt (maybe return
> > > new_stmt instead of rhs from convert_scalar_cond_reduction.
> > Eventually, we needed to assign rhs to res, and with an extra mov stmt
> > from rhs to res, the vectorizer failed.
> > the only difference in 166t.ifcvt between successful vectorization
> > and failed vectorization is below
> >char * _24;
> >char _25;
> >unsigned char _ifc__29;
> > +  unsigned char _30;
> >
> > [local count: 118111600]:
> >if (n_10(D) != 0)
> > @@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n)
> >_5 = c_14(D) + _1;
> >_6 = *_5;
> >_ifc__29 = _3 == _6 ? 1 : 0;
> > -  cnt_7 = cnt_18 + _ifc__29;
> > +  _30 = cnt_18 + _ifc__29;
> > +  cnt_7 = _30;
> >i_16 = i_20 + 1;
> >if (n_10(D) != i_16)
> >  goto ; [89.00%]
> > @@ -110,7 +112,7 @@ char foo2 (char * a, char * c, int n)
> >goto ; [100.00%]
> >
> > [local count: 105119324]:
> > -  # cnt_19 = PHI <cnt_7(3), cnt_27(15)>
> > +  # cnt_19 = PHI <_30(3), cnt_27(15)>
> >_21 = (char) cnt_19;
> >
> > if we want to eliminate the extra move, gimple_build and
> > gimple_convert are not suitable since they create a new lhs; is there
> > any interface like gimple_build_assign that accepts stmts?
>
> Hmm, OK.  So what you could do is replacing all uses of 'res'
> with 'rhs', alternatively replacing the last generated stmt LHS
> by 'res'.  Both are a bit hackish of course.  Usually the vectorizer
> just ignores copies like this but the reduction matching is
> unnecessarily strict here (but it's also a bit awkward to fix there).
>
> There's redundant_ssa_names which seems to be designed to
> handle propagating those out and there's the do_rpo_vn run,
> so I wonder why the stmt remains.  Now, for redundant_ssa_names
> you'd need to push a std::pair (res, rhs) to it in case rhs is an
> SSA name - does that help?
Yes, it worked.
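
For reference, a minimal sketch of what that amounts to in
predicate_scalar_phi (hedged -- argument names illustrative, the hunk
in the updated patch may differ):

  rhs = convert_scalar_cond_reduction (reduc, &gsi, cond, op0, op1,
                                       swap, has_nop, nop_reduc);
  /* Record the copy res = rhs; the existing redundant_ssa_names
     handling then propagates it out again, so the vectorizer never
     sees the extra move.  */
  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
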
>
> Richard.
> > >
> > > Richard.
> > >
> > > >   gcc/ChangeLog:
> > > >
> > > > PR tree-optimization/pr98365
> > > > * tree-if-conv.c (strip_nop_cond_scalar_reduction): New 
> > > > function.
> > > > (is_cond_scalar_reduction): Handle nop_expr in cond scalar 
> > > > reduction.
> > > > (convert_scalar_cond_reduction): Ditto.
> > > > (predicate_scalar_phi): Ditto.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR tree-optimization/pr98365
> > > > * gcc.target/i386/pr98365.c: New test.
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao


Update patch.

-- 
BR,
Hongtao
From f0593d1866af6f752e70e2ef2f1477471c252536 Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Wed, 6 Jan 2021 16:33:27 +0800
Subject: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr
 after/before scalar reduction.[PR98365]

gcc/ChangeLog:

	PR tree-optimization/pr98365
	* tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
	(is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction.
	(convert_scalar_cond_reduction): Ditto.
	(predicate_scalar_phi): Ditto.

gcc/testsuite/ChangeLog:

	PR tree-optimization/pr98365
	* gcc.target/i386/pr98365.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr98365.c |  22 
 gcc/tree-if-conv.c  | 142 +---
 2 files changed, 146 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98365.c

diff --git a/gcc/testsuite/gcc.target/i386/pr98365.c b/gcc/testsuite/gcc.target/i386/pr98365.c
new file mode 100644
index 000..652210dcdd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98365.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+short foo1 (short* a, short* c, int n)
+{
+  int i;
+  short cnt=0;
+  for (int i = 0;i != n; i++)
+if (a[i] == c[i])
+  cnt++;
+  return cnt;
+}
+
+char foo2 (char* a, char* c, int n)
+{
+  int i;
+  char cnt=0;
+  for (int i = 0;i != n; i++)
+if (a[i] == c[i])
+  cnt++;
+  return cnt;
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 716eae44a21..5eed087de36 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -1579,6 +1579,31 @@ if_convertible_loop_p (class loop *loop)
   return res;
 }
 
+/* Return reduc_1 if has_nop.
+
+   if (...)
+ tmp1 = (unsigned type) reduc_1;
+ tmp2 = tmp1 + rhs2;
+ reduc_3 = (signed type) tmp2.  */

Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-26 Thread Richard Biener via Gcc-patches
On Wed, May 26, 2021 at 7:06 AM Hongtao Liu  wrote:
>
> On Tue, May 25, 2021 at 6:24 PM Richard Biener
>  wrote:
> >
> > On Mon, May 24, 2021 at 11:52 AM Hongtao Liu  wrote:
> > >
> > > Hi:
> > >   Details described in PR.
> > >   Bootstrapped and regtest on
> > > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> > > -march=cascadelake,-march=cascadelake}
> > >   Ok for trunk?
> >
> > +static tree
> > +strip_nop_cond_scalar_reduction (bool has_nop, tree op)
> > +{
> > +  if (!has_nop)
> > +return op;
> > +
> > +  if (TREE_CODE (op) != SSA_NAME)
> > +return NULL_TREE;
> > +
> > +  gimple* stmt = SSA_NAME_DEF_STMT (op);
> > +  if (!stmt
> > +  || gimple_code (stmt) != GIMPLE_ASSIGN
> > +  || gimple_has_volatile_ops (stmt)
> > +  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
> > +return NULL_TREE;
> > +
> > +  return gimple_assign_rhs1 (stmt);
> >
> > this allows arbitrary conversions where the comment suggests you
> > only want to allow conversions to the same precision but different sign.
> > Sth like
> >
> >   gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
> >   if (!stmt
> >   || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
> >   || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
> > (gimple_assign_rhs1 (stmt))))
> > return NULL_TREE;
> >
> > +  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
> > + || gimple_code (stmt) != GIMPLE_ASSIGN
> > + || gimple_has_volatile_ops (stmt))
> > +   return false;
> >
> > !is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN
> >
> > the gimple_has_volatile_ops check is superfluous given you restrict
> > the assign code.
> >
> > +  /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
> > +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
> > +   {
> > + gimple *use_stmt = USE_STMT (use_p);
> > + if (is_gimple_debug (use_stmt))
> > +   continue;
> > + if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
> > +   continue;
> > + if (gimple_code (use_stmt) != GIMPLE_PHI)
> > +   return false;
> >
> > can the last check be use_stmt == phi since we should have the
> > PHI readily available?
> >
> > @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
> > gimple_stmt_iterator *gsi,
> >rhs = fold_build2 (gimple_assign_rhs_code (reduc),
> >  TREE_TYPE (rhs1), op0, tmp);
> >
> > +  if (has_nop)
> > +{
> > +  /* Create assignment nop_rhs = op0 +/- _ifc_.  */
> > +  tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_nop_");
> > +  gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
> > +  gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
> > +  /* Rebuild rhs for nop_expr.  */
> > +  rhs = fold_build1 (NOP_EXPR,
> > +TREE_TYPE (gimple_assign_lhs (nop_reduc)),
> > +nop_rhs);
> > +
> > +  /* Delete nop_reduc.  */
> > +  stmt_it = gsi_for_stmt (nop_reduc);
> > +  gsi_remove (&stmt_it, true);
> > +  release_defs (nop_reduc);
> > +}
> > +
> >
> > hmm, the whole function could be cleaned up with sth like
> >
> >  /* Build rhs for unconditional increment/decrement.  */
> >  gimple_seq stmts = NULL;
> >  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > TREE_TYPE (rhs1), op0, tmp);
> >  if (has_nop)
> >rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
> > (nop_reduc)), rhs);
> >  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >
> > plus in the caller moving the
> >
> >   new_stmt = gimple_build_assign (res, rhs);
> >   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >
> > to the else branch as well as the folding done on new_stmt (maybe return
> > new_stmt instead of rhs from convert_scalar_cond_reduction.
> Eventually, we needed to assign rhs to res, and with an extra mov stmt
> from rhs to res, the vectorizer failed.
> the only difference in 166t.ifcvt between successful vectorization
> and failed vectorization is below
>char * _24;
>char _25;
>unsigned char _ifc__29;
> +  unsigned char _30;
>
> [local count: 118111600]:
>if (n_10(D) != 0)
> @@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n)
>_5 = c_14(D) + _1;
>_6 = *_5;
>_ifc__29 = _3 == _6 ? 1 : 0;
> -  cnt_7 = cnt_18 + _ifc__29;
> +  _30 = cnt_18 + _ifc__29;
> +  cnt_7 = _30;
>i_16 = i_20 + 1;
>if (n_10(D) != i_16)
>  goto ; [89.00%]
> @@ -110,7 +112,7 @@ char foo2 (char * a, char * c, int n)
>goto ; [100.00%]
>
> [local count: 105119324]:
> -  # cnt_19 = PHI <cnt_7(3), cnt_27(15)>
> +  # cnt_19 = PHI <_30(3), cnt_27(15)>
>_21 = (char) cnt_19;
>
> if we want to eliminate the extra move, gimple_build and
> gimple_convert are not suitable since they create a new lhs; is there
> any interface like gimple_build_assign that accepts stmts?

Hmm, OK.  So what you could do is replacing all uses of 'res'
with 'rhs', alternatively replacing the last generated stmt LHS
by 'res'.

Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-25 Thread Hongtao Liu via Gcc-patches
On Tue, May 25, 2021 at 6:24 PM Richard Biener
 wrote:
>
> On Mon, May 24, 2021 at 11:52 AM Hongtao Liu  wrote:
> >
> > Hi:
> >   Details described in PR.
> >   Bootstrapped and regtest on
> > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> > -march=cascadelake,-march=cascadelake}
> >   Ok for trunk?
>
> +static tree
> +strip_nop_cond_scalar_reduction (bool has_nop, tree op)
> +{
> +  if (!has_nop)
> +return op;
> +
> +  if (TREE_CODE (op) != SSA_NAME)
> +return NULL_TREE;
> +
> +  gimple* stmt = SSA_NAME_DEF_STMT (op);
> +  if (!stmt
> +  || gimple_code (stmt) != GIMPLE_ASSIGN
> +  || gimple_has_volatile_ops (stmt)
> +  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
> +return NULL_TREE;
> +
> +  return gimple_assign_rhs1 (stmt);
>
> this allows arbitrary conversions where the comment suggests you
> only want to allow conversions to the same precision but different sign.
> Sth like
>
>   gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
>   if (!stmt
>   || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
>   || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
> (gimple_assign_rhs1 (stmt))))
> return NULL_TREE;
>
> +  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
> + || gimple_code (stmt) != GIMPLE_ASSIGN
> + || gimple_has_volatile_ops (stmt))
> +   return false;
>
> !is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN
>
> the gimple_has_volatile_ops check is superfluous given you restrict
> the assign code.
>
> +  /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
> +  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
> +   {
> + gimple *use_stmt = USE_STMT (use_p);
> + if (is_gimple_debug (use_stmt))
> +   continue;
> + if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
> +   continue;
> + if (gimple_code (use_stmt) != GIMPLE_PHI)
> +   return false;
>
> can the last check be use_stmt == phi since we should have the
> PHI readily available?
>
> @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
> gimple_stmt_iterator *gsi,
>rhs = fold_build2 (gimple_assign_rhs_code (reduc),
>  TREE_TYPE (rhs1), op0, tmp);
>
> +  if (has_nop)
> +{
> +  /* Create assignment nop_rhs = op0 +/- _ifc_.  */
> +  tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_nop_");
> +  gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
> +  gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
> +  /* Rebuild rhs for nop_expr.  */
> +  rhs = fold_build1 (NOP_EXPR,
> +TREE_TYPE (gimple_assign_lhs (nop_reduc)),
> +nop_rhs);
> +
> +  /* Delete nop_reduc.  */
> +  stmt_it = gsi_for_stmt (nop_reduc);
> +  gsi_remove (&stmt_it, true);
> +  release_defs (nop_reduc);
> +}
> +
>
> hmm, the whole function could be cleaned up with sth like
>
>  /* Build rhs for unconditional increment/decrement.  */
>  gimple_seq stmts = NULL;
>  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> TREE_TYPE (rhs1), op0, tmp);
>  if (has_nop)
>rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
> (nop_reduc)), rhs);
>  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>
> plus in the caller moving the
>
>   new_stmt = gimple_build_assign (res, rhs);
>   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>
> to the else branch as well as the folding done on new_stmt (maybe return
> new_stmt instead of rhs from convert_scalar_cond_reduction.
Eventually, we needed to assign rhs to res, and with an extra mov stmt
from rhs to res, the vectorizer failed.
the only difference in 166t.ifcvt between successful vectorization
and failed vectorization is below
   char * _24;
   char _25;
   unsigned char _ifc__29;
+  unsigned char _30;

[local count: 118111600]:
   if (n_10(D) != 0)
@@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n)
   _5 = c_14(D) + _1;
   _6 = *_5;
   _ifc__29 = _3 == _6 ? 1 : 0;
-  cnt_7 = cnt_18 + _ifc__29;
+  _30 = cnt_18 + _ifc__29;
+  cnt_7 = _30;
   i_16 = i_20 + 1;
   if (n_10(D) != i_16)
 goto ; [89.00%]
@@ -110,7 +112,7 @@ char foo2 (char * a, char * c, int n)
   goto ; [100.00%]

[local count: 105119324]:
-  # cnt_19 = PHI <cnt_7(3), cnt_27(15)>
+  # cnt_19 = PHI <_30(3), cnt_27(15)>
   _21 = (char) cnt_19;

if we want to eliminate the extra move, gimple_build and
gimple_convert are not suitable since they create a new lhs; is there
any interface like gimple_build_assign that accepts stmts?
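
As far as I understand, the gimple_build overloads always create a
fresh def, roughly (illustrative types and operands):

  gimple_seq stmts = NULL;
  /* The result is a newly created SSA name; there is no overload
     that writes into an existing lhs.  */
  tree tem = gimple_build (&stmts, PLUS_EXPR, type, op0, op1);

so a final copy from tem to res would still be needed.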
>
> Richard.
>
> >   gcc/ChangeLog:
> >
> > PR tree-optimization/pr98365
> > * tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
> > (is_cond_scalar_reduction): Handle nop_expr in cond scalar 
> > reduction.
> > (convert_scalar_cond_reduction): Ditto.
> > (predicate_scalar_phi): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/pr98365
> > * gcc.target/i386/pr98365.c: New test.

Re: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-25 Thread Richard Biener via Gcc-patches
On Mon, May 24, 2021 at 11:52 AM Hongtao Liu  wrote:
>
> Hi:
>   Details described in PR.
>   Bootstrapped and regtest on
> x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
> -march=cascadelake,-march=cascadelake}
>   Ok for trunk?

+static tree
+strip_nop_cond_scalar_reduction (bool has_nop, tree op)
+{
+  if (!has_nop)
+return op;
+
+  if (TREE_CODE (op) != SSA_NAME)
+return NULL_TREE;
+
+  gimple* stmt = SSA_NAME_DEF_STMT (op);
+  if (!stmt
+  || gimple_code (stmt) != GIMPLE_ASSIGN
+  || gimple_has_volatile_ops (stmt)
+  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
+return NULL_TREE;
+
+  return gimple_assign_rhs1 (stmt);

this allows arbitrary conversions where the comment suggests you
only want to allow conversions to the same precision but different sign.
Sth like

  gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op));
  if (!stmt
  || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
  || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE
(gimple_assign_rhs1 (stmt))))
return NULL_TREE;
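
(To illustrate: the conversions this admits are the same-precision
sign changes the PR exhibits -- a sketch of the gimple around the
reduction, not text from the patch:

  tmp1 = (unsigned char) reduc_1;   /* nop conversion  */
  tmp2 = tmp1 + rhs2;
  reduc_3 = (signed char) tmp2;     /* nop conversion back  */

while a widening cast such as (int) of a char would not pass
tree_nop_conversion_p.)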

+  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
+ || gimple_code (stmt) != GIMPLE_ASSIGN
+ || gimple_has_volatile_ops (stmt))
+   return false;

!is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN

the gimple_has_volatile_ops check is superfluous given you restrict
the assign code.

+  /* Check that R_NOP1 is used in nop_stmt or in PHI only.  */
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1)
+   {
+ gimple *use_stmt = USE_STMT (use_p);
+ if (is_gimple_debug (use_stmt))
+   continue;
+ if (use_stmt == SSA_NAME_DEF_STMT (r_op1))
+   continue;
+ if (gimple_code (use_stmt) != GIMPLE_PHI)
+   return false;

can the last check be use_stmt == phi since we should have the
PHI readily available?
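
I.e., assuming the PHI in question is the one passed in, the last
check could simply be -- sketch:

  if (use_stmt != phi)
    return false;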

@@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc,
gimple_stmt_iterator *gsi,
   rhs = fold_build2 (gimple_assign_rhs_code (reduc),
 TREE_TYPE (rhs1), op0, tmp);

+  if (has_nop)
+{
+  /* Create assignment nop_rhs = op0 +/- _ifc_.  */
+  tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_nop_");
+  gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs);
+  gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT);
+  /* Rebuild rhs for nop_expr.  */
+  rhs = fold_build1 (NOP_EXPR,
+TREE_TYPE (gimple_assign_lhs (nop_reduc)),
+nop_rhs);
+
+  /* Delete nop_reduc.  */
+  stmt_it = gsi_for_stmt (nop_reduc);
+  gsi_remove (&stmt_it, true);
+  release_defs (nop_reduc);
+}
+

hmm, the whole function could be cleaned up with sth like

 /* Build rhs for unconditional increment/decrement.  */
 gimple_seq stmts = NULL;
 rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
TREE_TYPE (rhs1), op0, tmp);
 if (has_nop)
   rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs
(nop_reduc)), rhs);
 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

plus in the caller moving the

  new_stmt = gimple_build_assign (res, rhs);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

to the else branch as well as the folding done on new_stmt (maybe return
new_stmt instead of rhs from convert_scalar_cond_reduction.
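
A sketch of that caller shape in predicate_scalar_phi (illustrative,
not necessarily what the updated patch should look like):

  if (is_cond_scalar_reduction (phi, &reduc, arg0, arg1, &op0, &op1,
                                false, &has_nop, &nop_reduc))
    /* The callee emits the whole sequence at GSI; it could return
       the final stmt (or its def) directly.  */
    rhs = convert_scalar_cond_reduction (reduc, &gsi, cond, op0, op1,
                                         swap, has_nop, nop_reduc);
  else
    {
      rhs = fold_build_cond_expr (TREE_TYPE (res), unshare_expr (cond),
                                  arg0, arg1);
      new_stmt = gimple_build_assign (res, rhs);
      gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
    }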

Richard.

>   gcc/ChangeLog:
>
> PR tree-optimization/pr98365
> * tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
> (is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction.
> (convert_scalar_cond_reduction): Ditto.
> (predicate_scalar_phi): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/pr98365
> * gcc.target/i386/pr98365.c: New test.
>
> --
> BR,
> Hongtao


[PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365]

2021-05-24 Thread Hongtao Liu via Gcc-patches
Hi:
  Details described in PR.
  Bootstrapped and regtest on
x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\
-march=cascadelake,-march=cascadelake}
  Ok for trunk?

  gcc/ChangeLog:

PR tree-optimization/pr98365
* tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
(is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction.
(convert_scalar_cond_reduction): Ditto.
(predicate_scalar_phi): Ditto.

gcc/testsuite/ChangeLog:

PR tree-optimization/pr98365
* gcc.target/i386/pr98365.c: New test.

-- 
BR,
Hongtao
From 0248efe310454a31e4bbeb1430f907ec663c39ae Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Wed, 6 Jan 2021 16:33:27 +0800
Subject: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr
 after/before scalar reduction.[PR98365]

gcc/ChangeLog:

	PR tree-optimization/pr98365
	* tree-if-conv.c (strip_nop_cond_scalar_reduction): New function.
	(is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction.
	(convert_scalar_cond_reduction): Ditto.
	(predicate_scalar_phi): Ditto.

gcc/testsuite/ChangeLog:

	PR tree-optimization/pr98365
	* gcc.target/i386/pr98365.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr98365.c |  22 
 gcc/tree-if-conv.c  | 131 +---
 2 files changed, 141 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98365.c

diff --git a/gcc/testsuite/gcc.target/i386/pr98365.c b/gcc/testsuite/gcc.target/i386/pr98365.c
new file mode 100644
index 000..652210dcdd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98365.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+short foo1 (short* a, short* c, int n)
+{
+  int i;
+  short cnt=0;
+  for (int i = 0;i != n; i++)
+if (a[i] == c[i])
+  cnt++;
+  return cnt;
+}
+
+char foo2 (char* a, char* c, int n)
+{
+  int i;
+  char cnt=0;
+  for (int i = 0;i != n; i++)
+if (a[i] == c[i])
+  cnt++;
+  return cnt;
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 716eae44a21..5811eb00478 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -1579,6 +1579,31 @@ if_convertible_loop_p (class loop *loop)
   return res;
 }
 
+/* Return reduc_1 if has_nop.
+
+   if (...)
+ tmp1 = (unsigned type) reduc_1;
+ tmp2 = tmp1 + rhs2;
+ reduc_3 = (signed type) tmp2.  */
+static tree
+strip_nop_cond_scalar_reduction (bool has_nop, tree op)
+{
+  if (!has_nop)
+return op;
+
+  if (TREE_CODE (op) != SSA_NAME)
+return NULL_TREE;
+
+  gimple* stmt = SSA_NAME_DEF_STMT (op);
+  if (!stmt
+  || gimple_code (stmt) != GIMPLE_ASSIGN
+  || gimple_has_volatile_ops (stmt)
+  || gimple_assign_rhs_code (stmt) != NOP_EXPR)
+return NULL_TREE;
+
+  return gimple_assign_rhs1 (stmt);
+}
+
 /* Returns true if def-stmt for phi argument ARG is simple increment/decrement
which is in predicated basic block.
In fact, the following PHI pattern is searching:
@@ -1595,9 +1620,10 @@ if_convertible_loop_p (class loop *loop)
 
 static bool
 is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
-			  tree *op0, tree *op1, bool extended)
+			  tree *op0, tree *op1, bool extended, bool* has_nop,
+			  gimple **nop_reduc)
 {
-  tree lhs, r_op1, r_op2;
+  tree lhs, r_op1, r_op2, r_nop1, r_nop2;
   gimple *stmt;
   gimple *header_phi = NULL;
   enum tree_code reduction_op;
@@ -1608,7 +1634,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
   use_operand_p use_p;
   edge e;
   edge_iterator ei;
-  bool result = false;
+  bool result = *has_nop = false;
   if (TREE_CODE (arg_0) != SSA_NAME || TREE_CODE (arg_1) != SSA_NAME)
 return false;
 
@@ -1656,18 +1682,78 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
 return false;
 
   reduction_op = gimple_assign_rhs_code (stmt);
+
+/* Catch something like below
+
+ loop-header:
+ reduc_1 = PHI <..., reduc_2>
+ ...
+ if (...)
+ tmp1 = (unsigned type) reduc_1;
+ tmp2 = tmp1 + rhs2;
+ reduc_3 = (signed type) tmp2;
+
+ reduc_2 = PHI <reduc_1, reduc_3>
+
+ and convert to
+
+ reduc_2 = PHI <0, reduc_3>
+ tmp1 = (unsigned type)reduc_1;
+ ifcvt = cond_expr ? rhs2 : 0
+ tmp2 = tmp1 +/- ifcvt;
+ reduc_1 = (signed type)tmp2.  */
+
+  if (reduction_op == NOP_EXPR)
+{
+  lhs = gimple_assign_rhs1 (stmt);
+  if (TREE_CODE (lhs) != SSA_NAME
+	  || !has_single_use (lhs))
+	return false;
+
+  *nop_reduc = stmt;
+  stmt = SSA_NAME_DEF_STMT (lhs);
+  if (gimple_bb (stmt) != gimple_bb (*nop_reduc)
+	  || gimple_code (stmt) != GIMPLE_ASSIGN
+	  || gimple_has_volatile_ops (stmt))
+	return false;
+
+  *has_nop = true;
+  reduction_op = gimple_assign_rhs_code (stmt);
+}
+
   if