[PATCH] forwprop, v2: Canonicalize atomic fetch_op op x to op_fetch or vice versa [PR98737]

2022-01-13 Thread Jakub Jelinek via Gcc-patches
On Thu, Jan 13, 2022 at 04:07:20PM +0100, Richard Biener wrote:
> I'm mostly concerned about the replace_uses_by use.  forwprop
> will go over newly emitted stmts and thus the hypothetical added
> 
> lhs2 = d;
> 
> record the copy and schedule the stmt for removal, substituting 'd'
> in each use as it goes along the function and folding them.  It's
> a bit iffy (and maybe has unintended side-effects in odd cases)
> to trample around and fold stuff behind that flows back.
> 
> I'd always vote to simplify the folding code so it's easier to
> maintain and not micro-optimize there since it's not going to be
> a hot part of the compiler.

Ok.  So like this?

2022-01-13  Jakub Jelinek  

PR target/98737
* tree-ssa-forwprop.c (simplify_builtin_call): Canonicalize
__atomic_fetch_op (p, x, y) op x into __atomic_op_fetch (p, x, y)
and __atomic_op_fetch (p, x, y) iop x into
__atomic_fetch_op (p, x, y).

* gcc.dg/tree-ssa/pr98737-1.c: New test.
* gcc.dg/tree-ssa/pr98737-2.c: New test.
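
For illustration (my own sketch, not part of the patch nor of the new
pr98737 testcases; the function names are made up), the source-level
patterns covered by the two directions of the canonicalization look
like this:

  unsigned long
  add_then_reapply (unsigned long *p, unsigned long x)
  {
    /* __atomic_fetch_add returns the pre-op value; adding x again gives
       the post-op value, i.e. __atomic_add_fetch (p, x, order).  */
    return __atomic_fetch_add (p, x, __ATOMIC_SEQ_CST) + x;
  }

  unsigned long
  sub_back_out (unsigned long *p, unsigned long x)
  {
    /* __atomic_add_fetch returns the post-op value; subtracting x again
       gives the pre-op value, i.e. __atomic_fetch_add (p, x, order).  */
    return __atomic_add_fetch (p, x, __ATOMIC_SEQ_CST) - x;
  }

In both cases the combined operation maps directly onto the other
builtin, so the separate arithmetic statement can be dropped.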

--- gcc/tree-ssa-forwprop.c.jj  2022-01-11 23:11:23.467275019 +0100
+++ gcc/tree-ssa-forwprop.c 2022-01-13 18:09:50.318625915 +0100
@@ -1241,12 +1241,19 @@ constant_pointer_difference (tree p1, tr
memset (p + 4, ' ', 3);
into
memcpy (p, "abcd   ", 7);
-   call if the latter can be stored by pieces during expansion.  */
+   call if the latter can be stored by pieces during expansion.
+
+   Also canonicalize __atomic_fetch_op (p, x, y) op x
+   to __atomic_op_fetch (p, x, y) or
+   __atomic_op_fetch (p, x, y) iop x
+   to __atomic_fetch_op (p, x, y) when possible (also __sync).  */
 
 static bool
 simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
 {
   gimple *stmt1, *stmt2 = gsi_stmt (*gsi_p);
+  enum built_in_function other_atomic = END_BUILTINS;
+  enum tree_code atomic_op = ERROR_MARK;
   tree vuse = gimple_vuse (stmt2);
   if (vuse == NULL)
 return false;
@@ -1448,6 +1455,290 @@ simplify_builtin_call (gimple_stmt_itera
}
}
   break;
+
+ #define CASE_ATOMIC(NAME, OTHER, OP) \
+case BUILT_IN_##NAME##_1:  \
+case BUILT_IN_##NAME##_2:  \
+case BUILT_IN_##NAME##_4:  \
+case BUILT_IN_##NAME##_8:  \
+case BUILT_IN_##NAME##_16: \
+  atomic_op = OP;  \
+  other_atomic \
+   = (enum built_in_function) (BUILT_IN_##OTHER##_1\
+   + (DECL_FUNCTION_CODE (callee2) \
+  - BUILT_IN_##NAME##_1)); \
+  goto handle_atomic_fetch_op;
+
+CASE_ATOMIC (ATOMIC_FETCH_ADD, ATOMIC_ADD_FETCH, PLUS_EXPR)
+CASE_ATOMIC (ATOMIC_FETCH_SUB, ATOMIC_SUB_FETCH, MINUS_EXPR)
+CASE_ATOMIC (ATOMIC_FETCH_AND, ATOMIC_AND_FETCH, BIT_AND_EXPR)
+CASE_ATOMIC (ATOMIC_FETCH_XOR, ATOMIC_XOR_FETCH, BIT_XOR_EXPR)
+CASE_ATOMIC (ATOMIC_FETCH_OR, ATOMIC_OR_FETCH, BIT_IOR_EXPR)
+
+CASE_ATOMIC (SYNC_FETCH_AND_ADD, SYNC_ADD_AND_FETCH, PLUS_EXPR)
+CASE_ATOMIC (SYNC_FETCH_AND_SUB, SYNC_SUB_AND_FETCH, MINUS_EXPR)
+CASE_ATOMIC (SYNC_FETCH_AND_AND, SYNC_AND_AND_FETCH, BIT_AND_EXPR)
+CASE_ATOMIC (SYNC_FETCH_AND_XOR, SYNC_XOR_AND_FETCH, BIT_XOR_EXPR)
+CASE_ATOMIC (SYNC_FETCH_AND_OR, SYNC_OR_AND_FETCH, BIT_IOR_EXPR)
+
+CASE_ATOMIC (ATOMIC_ADD_FETCH, ATOMIC_FETCH_ADD, MINUS_EXPR)
+CASE_ATOMIC (ATOMIC_SUB_FETCH, ATOMIC_FETCH_SUB, PLUS_EXPR)
+CASE_ATOMIC (ATOMIC_XOR_FETCH, ATOMIC_FETCH_XOR, BIT_XOR_EXPR)
+
+CASE_ATOMIC (SYNC_ADD_AND_FETCH, SYNC_FETCH_AND_ADD, MINUS_EXPR)
+CASE_ATOMIC (SYNC_SUB_AND_FETCH, SYNC_FETCH_AND_SUB, PLUS_EXPR)
+CASE_ATOMIC (SYNC_XOR_AND_FETCH, SYNC_FETCH_AND_XOR, BIT_XOR_EXPR)
+
+#undef CASE_ATOMIC
+
+handle_atomic_fetch_op:
+  if (gimple_call_num_args (stmt2) >= 2 && gimple_call_lhs (stmt2))
+   {
+ tree lhs2 = gimple_call_lhs (stmt2), lhsc = lhs2;
+ tree arg = gimple_call_arg (stmt2, 1);
+ gimple *use_stmt, *cast_stmt = NULL;
+ use_operand_p use_p;
+ tree ndecl = builtin_decl_explicit (other_atomic);
+
+ if (ndecl == NULL_TREE || !single_imm_use (lhs2, &use_p, &use_stmt))
+   break;
+
+ if (gimple_assign_cast_p (use_stmt))
+   {
+ cast_stmt = use_stmt;
+ lhsc = gimple_assign_lhs (cast_stmt);
+ if (lhsc == NULL_TREE
+ || !INTEGRAL_TYPE_P (TREE_TYPE (lhsc))
+ || (TYPE_PRECISION (TREE_TYPE (lhsc))
+ != TYPE_PRECISION (TREE_TYPE (lhs2)))
+ || !single_imm_use (lhsc, &use_p, &use_stmt))
+   {
+ use_stmt = cast_stmt;
+ cast_stmt = NULL;
+ lhsc = lhs2;