For possible reductions, ifconv currently handles if the addition is on one side of the if. But in the case of PR 119920, the reduction addition is on both sides of the if. E.g. ``` if (_27 == 0) goto <bb 14>; [50.00%] else goto <bb 13>; [50.00%]
<bb 14> a_29 = b_14(D) + a_17; goto <bb 15>; [100.00%] <bb 13> a_28 = c_12(D) + a_17; <bb 15> # a_30 = PHI <a_28(13), a_29(14)> ``` Which ifcvt converts into: ``` _34 = _32 + _33; a_15 = (int) _34; _23 = _4 == 0; _37 = _33 + _35; a_13 = (int) _37; a_5 = _23 ? a_15 : a_13; ``` But the vectorizer does not recognize this as a reduction. To fix this, we should factor out the addition from the `if`. This allows us to get: ``` iftmp.0_7 = _22 ? b_13(D) : c_12(D); a_14 = iftmp.0_7 + a_18; ``` Which then the vectorizer recognizes as a reduction. In the case of PR 112324 and PR 110015, it is similar but with MAX_EXPR reduction instead of an addition. Note while this should be done in phiopt, there are regressions due to other passes not able to handle the factored out cases (see linked bug to PR 64700). I have not had time to fix all of the passes that could handle the addition being in the if/then/else rather than being outside yet. So this is I thought it would be useful just to have a localized version in ifconv which is then only used for the vectorizer. Bootstrapped and tested on x86_64-linux-gnu. PR tree-optimization/119920 PR tree-optimization/112324 PR tree-optimization/110015 gcc/ChangeLog: * tree-if-conv.cc (find_different_opnum): New function. (factor_out_operators): New function. (predicate_scalar_phi): Call factor_out_operators when there is only 2 elements of a phi. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-reduc-cond-1.c: New test. * gcc.dg/vect/vect-reduc-cond-2.c: New test. * gcc.dg/vect/vect-reduc-cond-3.c: New test. Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> --- gcc/testsuite/gcc.dg/vect/vect-reduc-cond-1.c | 59 ++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-cond-2.c | 61 ++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-cond-3.c | 56 ++++++ gcc/tree-if-conv.cc | 186 ++++++++++++++++++ 4 files changed, 362 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-cond-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-cond-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-cond-3.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-1.c new file mode 100644 index 00000000000..d8356b4685c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-1.c @@ -0,0 +1,59 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +/* PR tree-optimization/119920 */ + +#define N 32 + +unsigned int ub[N]; + +/* Test vectorization of reduction of unsigned-int. */ + +__attribute__ ((noinline, noipa)) +void init(void) +{ + #pragma GCC novector + for(int i = 0;i < N; i++) + ub[i] = i; +} + + +__attribute__ ((noinline, noipa)) +void main1 (unsigned int b, unsigned int c) +{ + int i; + unsigned int usum = 0; + + init(); + + /* Summation. */ + for (i = 0; i < N; i++) { + if ( ub[i] < N/2 ) + { + usum += b; + } + else + { + usum += c; + } + } + + /* check results: */ + /* __builtin_printf("%d : %d\n", usum, (N/2*b + N/2*c)); */ + if (usum != N/2*b + N/2*c) + abort (); +} + +int main (void) +{ + check_vect (); + + main1 (0, 0); + main1 (1, 1); + main1 (10, 1); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_int_add } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-2.c new file mode 100644 index 00000000000..80c1dba9fc1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-2.c @@ -0,0 +1,61 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-fdump-tree-ifcvt-details" } */ + +#include <stdarg.h> +#include "tree-vect.h" + +/* PR tree-optimization/119920 */ + +#define N 32 + +unsigned int ub[N]; +unsigned int ua[N]; + +/* Test vectorization of reduction of unsigned-int. */ + +__attribute__ ((noinline, noipa)) +void init(void) +{ + #pragma GCC novector + for(int i = 0;i < N; i++) { + ub[i] = i; + ua[i] = 1; + } +} + + +__attribute__ ((noinline, noipa)) +void main1 (unsigned int b, unsigned int c) +{ + int i; + unsigned int usum = 0; + + init(); + + /* Summation. */ + for (i = 0; i < N; i++) { + unsigned t = ua[i]; + if ( ub[i] < N/2 ) + usum += b * t; + else + usum += c * t; + } + + /* check results: */ + /* __builtin_printf("%d : %d\n", usum, (N/2*b*1 + N/2*c*1)); */ + if (usum != N/2*b + N/2*c) + abort (); +} + +int main (void) +{ + check_vect (); + + main1 (0, 0); + main1 (1, 1); + main1 (10, 1); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_int_add } } } } */ +/* { dg-final { scan-tree-dump-times "changed to factor operation out from COND_EXPR" 2 "ifcvt" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-3.c new file mode 100644 index 00000000000..e425869ccf9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-3.c @@ -0,0 +1,56 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +/* PR tree-optimization/112324 */ +/* PR tree-optimization/110015 */ + +#define N 32 + +int ub[N]; + +/* Test vectorization of reduction of int max with some extra code involed. */ + +__attribute__ ((noinline, noipa)) +void init(void) +{ + #pragma GCC novector + for(int i = 0;i < N; i++) + ub[i] = (i&4) && (i&1) ? -i : i; +} + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +__attribute__ ((noinline, noipa)) +void main1 (void) +{ + int i; + int max = 0; + + init(); + + /* Summation. */ + for (i = 0; i < N; i++) { + int tmp = ub[i]; + if (tmp < 0) + max = MAX (-tmp, max); + else + max = MAX (tmp, max); + } + + /* check results: */ + /* __builtin_printf("%d : %d\n", max, N); */ + if (max != N - 1) + abort (); +} + +int main (void) +{ + check_vect (); + + main1 (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_int_min_max } } } } */ diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index 366e959fd77..ba25c19c50c 100644 --- a/gcc/tree-if-conv.cc +++ b/gcc/tree-if-conv.cc @@ -2114,6 +2114,187 @@ gen_phi_arg_condition (gphi *phi, ifcvt_arg_entry_t &arg, return cond; } +/* Find the operand which is different between ARG0_OP and ARG1_OP. + Returns the operand num where the difference is. + Set NEWARG0 and NEWARG1 from the different argument. + Returns -1 if none is found. + If ARG0_OP/ARG1_OP is commutative also try swapping the + two commutative operands and return the operand number where + the difference happens in ARG0_OP. */ + +static int +find_different_opnum (const gimple_match_op &arg0_op, + const gimple_match_op &arg1_op, + tree *new_arg0, tree *new_arg1) +{ + unsigned opnum = -1; + unsigned first; + first = first_commutative_argument (arg1_op.code, arg1_op.type); + for (unsigned i = 0; i < arg0_op.num_ops; i++) + { + if (!operand_equal_for_phi_arg_p (arg0_op.ops[i], + arg1_op.ops[i])) + { + /* Can handle only one non equal operand. */ + if (opnum != -1u) + { + /* Though if opnum is right before i and opnum is equal + to the first communtative argument, handle communtative + specially. */ + if (i == opnum + 1 && opnum == first) + goto commutative; + return -1; + } + opnum = i; + } + } + /* If all operands are equal only do this is there was single + operand. */ + if (opnum == -1u) + { + if (arg0_op.num_ops != 1) + return -1; + opnum = 0; + } + *new_arg0 = arg0_op.ops[opnum]; + *new_arg1 = arg1_op.ops[opnum]; + return opnum; + +/* Handle commutative operations. */ +commutative: + gcc_assert (first != -1u); + + /* Check the rest of the arguments to make sure they are the same. */ + for (unsigned i = first + 2; i < arg0_op.num_ops; i++) + if (!operand_equal_for_phi_arg_p (arg0_op.ops[i], + arg1_op.ops[i])) + return -1; + + /* If the arg0[first+1] and arg1[first] are the same + then the one which is different is arg0[first] and arg1[first+1] + return first since this is based on arg0. */ + if (operand_equal_for_phi_arg_p (arg0_op.ops[first + 1], + arg1_op.ops[first])) + { + *new_arg0 = arg0_op.ops[first]; + *new_arg1 = arg1_op.ops[first + 1]; + return first; + } + /* If the arg0[first] and arg1[first+1] are the same + then the one which is different is arg0[first+1] and arg1[first] + return first+1 since this is based on arg0. */ + if (operand_equal_for_phi_arg_p (arg0_op.ops[first], + arg1_op.ops[first + 1])) + { + *new_arg0 = arg0_op.ops[first + 1]; + *new_arg1 = arg1_op.ops[first]; + return first + 1; + } + return -1; +} + +/* Factors out an operation from *ARG0 and *ARG1 and + create the new statement at GSI. *RES is the + result of that new statement. Update *ARG0 and *ARG1 + and *RES to the new values if the factoring happened. + Loops until all of the factoring is completed. */ + +static void +factor_out_operators (tree *res, gimple_stmt_iterator *gsi, + tree *arg0, tree *arg1, gphi *phi) +{ + gimple_match_op arg0_op, arg1_op; + bool repeated = false; + +again: + if (TREE_CODE (*arg0) != SSA_NAME || TREE_CODE (*arg1) != SSA_NAME) + return; + + if (operand_equal_p (*arg0, *arg1)) + return; + + /* If either args have > 1 use, then this transformation actually + increases the number of expressions evaluated at runtime. */ + if (repeated + ? (!has_zero_uses (*arg0) || !has_zero_uses (*arg1)) + : (!has_single_use (*arg0) || !has_single_use (*arg1))) + return; + + gimple *arg0_def_stmt = SSA_NAME_DEF_STMT (*arg0); + if (!gimple_extract_op (arg0_def_stmt, &arg0_op)) + return; + + /* At this point there should be no ssa names occuring in abnormals. */ + gcc_assert (!arg0_op.operands_occurs_in_abnormal_phi ()); + + gimple *arg1_def_stmt = SSA_NAME_DEF_STMT (*arg1); + if (!gimple_extract_op (arg1_def_stmt, &arg1_op)) + return; + + /* At this point there should be no ssa names occuring in abnormals. */ + gcc_assert (!arg1_op.operands_occurs_in_abnormal_phi ()); + + /* No factoring can happen if the codes are different + or the number operands. */ + if (arg1_op.code != arg0_op.code + || arg1_op.num_ops != arg0_op.num_ops) + return; + + tree new_arg0, new_arg1; + int opnum = find_different_opnum (arg0_op, arg1_op, &new_arg0, &new_arg1); + if (opnum == -1) + return; + + if (!types_compatible_p (TREE_TYPE (new_arg0), TREE_TYPE (new_arg1))) + return; + tree new_res = make_ssa_name (TREE_TYPE (new_arg0), NULL); + + /* Create the operation stmt if possible and insert it. */ + + gimple_match_op new_op = arg0_op; + new_op.ops[opnum] = new_res; + gimple_seq seq = NULL; + tree result = *res; + result = maybe_push_res_to_seq (&new_op, &seq, result); + + /* If we can't create the new statement, release the temp name + and return back. */ + if (!result) + { + release_ssa_name (new_res); + return; + } + gsi_insert_seq_before (gsi, seq, GSI_CONTINUE_LINKING); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "PHI "); + print_generic_expr (dump_file, gimple_phi_result (phi)); + fprintf (dump_file, + " changed to factor operation out from COND_EXPR.\n"); + fprintf (dump_file, "New stmt with OPERATION that defines "); + print_generic_expr (dump_file, result); + fprintf (dump_file, ".\n"); + } + + /* Remove the old operation(s) that has single use. */ + gimple_stmt_iterator gsi_for_def; + + gsi_for_def = gsi_for_stmt (arg0_def_stmt); + gsi_remove (&gsi_for_def, true); + release_defs (arg0_def_stmt); + gsi_for_def = gsi_for_stmt (arg1_def_stmt); + gsi_remove (&gsi_for_def, true); + release_defs (arg1_def_stmt); + + /* Update the arguments and try again. */ + *arg0 = new_arg0; + *arg1 = new_arg1; + *res = new_res; + repeated = true; + goto again; +} + /* Create the smallest nested conditional possible. On pre-order we record which conditionals are live, and on post-order rewrite the chain by removing already active conditions. @@ -2293,6 +2474,11 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi, bool loop_versioned) arg0 = gimple_phi_arg_def (phi, 0); arg1 = gimple_phi_arg_def (phi, 1); } + + /* Factor out operand if possible. This can only be done easily + for PHI with 2 elements. */ + factor_out_operators (&res, gsi, &arg0, &arg1, phi); + if (is_cond_scalar_reduction (phi, &reduc, arg0, arg1, &op0, &op1, false, &has_nop, &nop_reduc)) -- 2.43.0