On Tue, Nov 9, 2021 at 6:22 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao....@intel.com> wrote:
> >
> > This will enable transformation like
> >
> > -  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
> > -  # sum2_52 = PHI <sum2_21(13), 0(4)>
> > +  # sum1_50 = PHI <_87(13), 0(4)>
> > +  # sum2_52 = PHI <_89(13), 0(4)>
> >    # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
> >    i.2_7 = (long unsigned int) i_49;
> >    _8 = i.2_7 * 8;
> > ...
> >    vec1_i_38 = vec1_29 >> _10;
> >    vec2_i_39 = vec2_31 >> _10;
> >    _11 = vec1_i_38 & 1;
> > -  _63 = tmp_37 ^ sum1_50;
> > -  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
> > +  _ifc__86 = _11 != 0 ? tmp_37 : 0;
> > +  _87 = sum1_50 ^ _ifc__86;
> >    _12 = vec2_i_39 & 1;
> >    :
> >
> > so that the vectorizer won't fail due to
> >
> >   /* If this isn't a nested cycle or if the nested cycle reduction value
> >      is used ouside of the inner loop we cannot handle uses of the reduction
> >      value.  */
> >   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
> >     {
> >       if (dump_enabled_p ())
> >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >                          "reduction used in loop.\n");
> >       return NULL;
> >     }
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/103126
> >         * tree-if-conv.c (is_cond_scalar_reduction): Handle
> >         BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
> >         (convert_scalar_cond_reduction): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
> > ---
> >  .../i386/ifcvt-reduction-logic-op.c           | 80 +++++++++++++++++++
> >  gcc/tree-if-conv.c                            | 19 +++--
> >  2 files changed, 92 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > new file mode 100644
> > index 00000000000..eeb822d5d43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > @@ -0,0 +1,80 @@
> > +/* PR tree-optimization/103126.  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
> > +#include<stdint.h>
> > +
> > +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +                        int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = 0;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 ^= (vec1_i & 1) ? tmp : 0;
> > +      if (vec2_i&1) sum2 ^= tmp;
> > +    }
> > +    *ans++ ^= sum1; n--;
> > +    *ans++ ^= sum2; n--;
> > +  }
> > +}
> > +
> > +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +                        int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = 0;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 |= (vec1_i & 1) ? tmp : 0;
> > +      if (vec2_i&1) sum2 |= tmp;
> > +    }
> > +    *ans++ |= sum1; n--;
> > +    *ans++ |= sum2; n--;
> > +  }
> > +}
> > +
> > +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +                        int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = -1;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 &= (vec1_i & 1) ? tmp : -1;
> > +      if (vec2_i&1) sum2 &= tmp;
> > +    }
> > +    *ans++ &= sum1; n--;
> > +    *ans++ &= sum2; n--;
> > +  }
> > +}
> > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> > index b165dc0c17f..7df1103ff89 100644
> > --- a/gcc/tree-if-conv.c
> > +++ b/gcc/tree-if-conv.c
> > @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> >        reduction_op = gimple_assign_rhs_code (stmt);
> >      }
> >
> > -  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
> > +  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
> > +      && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
> > +      && reduction_op != BIT_AND_EXPR)
>
> Please put each && on a separate line.

Changed.

> >      return false;
> >    r_op1 = gimple_assign_rhs1 (stmt);
> >    r_op2 = gimple_assign_rhs2 (stmt);
> > @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> >
> >    /* Make R_OP1 to hold reduction variable.  */
> >    if (r_nop2 == PHI_RESULT (header_phi)
> > -      && reduction_op == PLUS_EXPR)
> > +      && commutative_tree_code (reduction_op))
> >      {
> >        std::swap (r_op1, r_op2);
> >        std::swap (r_nop1, r_nop2);
> > @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> >    tree rhs1 = gimple_assign_rhs1 (reduc);
> >    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
> >    tree c;
> > -  tree zero = build_zero_cst (TREE_TYPE (rhs1));
> > +  enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
> > +  tree op_nochange = reduction_op != BIT_AND_EXPR
> > +                     ? build_zero_cst (TREE_TYPE (rhs1))
> > +                     : build_minus_one_cst (TREE_TYPE (rhs1));
>
> maybe export neutral_op_for_reduction and use it here (supply NULL
> initial_value)?

Changed (I didn't know there was such a function).

> Otherwise looks OK.
>
> Thanks,
> Richard.

> >    gimple_seq stmts = NULL;
> >
> >    if (dump_file && (dump_flags & TDF_DETAILS))
> > @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> >       of reduction rhs.  */
> >    c = fold_build_cond_expr (TREE_TYPE (rhs1),
> >                              unshare_expr (cond),
> > -                            swap ? zero : op1,
> > -                            swap ? op1 : zero);
> > +                            swap ? op_nochange : op1,
> > +                            swap ? op1 : op_nochange);
> >
> >    /* Create assignment stmt and insert it at GSI.  */
> >    new_assign = gimple_build_assign (tmp, c);
> >    gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> > -  /* Build rhs for unconditional increment/decrement.  */
> > -  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > +  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> > +  rhs = gimple_build (&stmts, reduction_op,
> >                        TREE_TYPE (rhs1), op0, tmp);
> >
> >    if (has_nop)
> > --
> > 2.18.1
> >
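To restate the transformation in plain source terms: the conditional update of
the reduction variable is rewritten so that only the value being combined in is
selected, not the whole result, leaving the reduction PHI with a single in-loop
use. A minimal C sketch of the idea for the XOR case (variable names are
illustrative, not the actual GIMPLE temporaries):

  /* Before if-conversion: sum appears both in the combined value and in
     the select, so the reduction PHI has two in-loop uses and the
     vectorizer rejects it.  */
  sum = (vec_i & 1) ? (sum ^ tmp) : sum;

  /* After if-conversion: select 0 (the XOR identity) when the condition
     is false and combine unconditionally; sum now has one in-loop use.  */
  uint64_t t = (vec_i & 1) ? tmp : 0;
  sum = sum ^ t;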
Here's the patch I'm going to check in.

--
BR,
Hongtao
From 41b806b99bb4e7bf760bb0a4902ae426e2596fd5 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Mon, 8 Nov 2021 15:49:17 +0800
Subject: [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to
 handle bit_and/bit_xor/bit_ior.

This will enable transformation like

-  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
-  # sum2_52 = PHI <sum2_21(13), 0(4)>
+  # sum1_50 = PHI <_87(13), 0(4)>
+  # sum2_52 = PHI <_89(13), 0(4)>
   # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
   i.2_7 = (long unsigned int) i_49;
   _8 = i.2_7 * 8;
...
   vec1_i_38 = vec1_29 >> _10;
   vec2_i_39 = vec2_31 >> _10;
   _11 = vec1_i_38 & 1;
-  _63 = tmp_37 ^ sum1_50;
-  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
+  _ifc__86 = _11 != 0 ? tmp_37 : 0;
+  _87 = sum1_50 ^ _ifc__86;
   _12 = vec2_i_39 & 1;
   :

so that the vectorizer won't fail due to

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used ouside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "reduction used in loop.\n");
      return NULL;
    }

gcc/ChangeLog:

	PR tree-optimization/103126
	* tree-vect-loop.c (neutral_op_for_reduction): Remove static.
	* tree-vectorizer.h (neutral_op_for_reduction): Declare.
	* tree-if-conv.c: Include tree-vectorizer.h.
	(is_cond_scalar_reduction): Handle
	BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
	(convert_scalar_cond_reduction): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
---
 .../i386/ifcvt-reduction-logic-op.c           | 80 +++++++++++++++++++
 gcc/tree-if-conv.c                            | 20 +++-
 gcc/tree-vect-loop.c                          |  2 +-
 gcc/tree-vectorizer.h                         |  1 +
 4 files changed, 95 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c

diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
new file mode 100644
index 00000000000..eeb822d5d43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
@@ -0,0 +1,80 @@
+/* PR tree-optimization/103126.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
+#include<stdint.h>
+
+void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+                        int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 ^= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 ^= tmp;
+    }
+    *ans++ ^= sum1; n--;
+    *ans++ ^= sum2; n--;
+  }
+}
+
+void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+                        int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 |= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 |= tmp;
+    }
+    *ans++ |= sum1; n--;
+    *ans++ |= sum2; n--;
+  }
+}
+
+void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+                        int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = -1;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 &= (vec1_i & 1) ? tmp : -1;
+      if (vec2_i&1) sum2 &= tmp;
+    }
+    *ans++ &= sum1; n--;
+    *ans++ &= sum2; n--;
+  }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index b165dc0c17f..e88ddc9f788 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -120,6 +120,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-sccvn.h"
 #include "tree-cfgcleanup.h"
 #include "tree-ssa-dse.h"
+#include "tree-vectorizer.h"
 
 /* Only handle PHIs with no more arguments unless we are asked to
    by simd pragma.  */
@@ -1732,7 +1733,11 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
       reduction_op = gimple_assign_rhs_code (stmt);
     }
 
-  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
+  if (reduction_op != PLUS_EXPR
+      && reduction_op != MINUS_EXPR
+      && reduction_op != BIT_IOR_EXPR
+      && reduction_op != BIT_XOR_EXPR
+      && reduction_op != BIT_AND_EXPR)
     return false;
   r_op1 = gimple_assign_rhs1 (stmt);
   r_op2 = gimple_assign_rhs2 (stmt);
@@ -1742,7 +1747,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
 
   /* Make R_OP1 to hold reduction variable.  */
   if (r_nop2 == PHI_RESULT (header_phi)
-      && reduction_op == PLUS_EXPR)
+      && commutative_tree_code (reduction_op))
     {
       std::swap (r_op1, r_op2);
       std::swap (r_nop1, r_nop2);
@@ -1811,7 +1816,8 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   tree rhs1 = gimple_assign_rhs1 (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
-  tree zero = build_zero_cst (TREE_TYPE (rhs1));
+  enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1824,14 +1830,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
      of reduction rhs.  */
   c = fold_build_cond_expr (TREE_TYPE (rhs1),
                             unshare_expr (cond),
-                            swap ? zero : op1,
-                            swap ? op1 : zero);
+                            swap ? op_nochange : op1,
+                            swap ? op1 : op_nochange);
 
   /* Create assignment stmt and insert it at GSI.  */
   new_assign = gimple_build_assign (tmp, c);
   gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement.  */
-  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+  /* Build rhs for unconditional increment/decrement/logic_operation.  */
+  rhs = gimple_build (&stmts, reduction_op,
                       TREE_TYPE (rhs1), op0, tmp);
 
   if (has_nop)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a28bb6321d7..fa4cf88ce51 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3330,7 +3330,7 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
    of the scalar elements.  If the reduction has just a single initial
    value then INITIAL_VALUE is that value, otherwise it is null.  */
 
-static tree
+tree
 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
 {
   switch (code)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b552e9dccce..51ab21896aa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2120,6 +2120,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.c.  */
+extern tree neutral_op_for_reduction (tree, tree_code, tree);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.c */
-- 
2.18.1
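For the record, the neutral value neutral_op_for_reduction supplies for these
codes is the identity of each operation: zero for PLUS/MINUS/XOR/IOR and
all-ones for AND, matching the build_zero_cst/build_minus_one_cst pair the
first version of the patch open-coded. A small self-contained C illustration
of that correspondence (hypothetical helper for exposition, not the GCC
function itself):

  #include <stdint.h>

  /* Identity element per reduction operation: combining any x with it
     leaves x unchanged, so the not-taken branch becomes a no-op update.  */
  static uint64_t
  neutral_value (char op)
  {
    switch (op)
      {
      case '+':                    /* x + 0 == x */
      case '^':                    /* x ^ 0 == x */
      case '|': return 0;          /* x | 0 == x */
      case '&': return UINT64_MAX; /* x & ~0ULL == x */
      default: __builtin_unreachable ();
      }
  }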