Hi, Please refer to below link for previous threads. https://gcc.gnu.org/ml/gcc-patches/2019-03/msg00348.html
Comparing to patch v2, I've moved up the vector operation target check upward together with vector type target check. Besides, I ran bootstrap and regtest on powerpc64-linux-gnu (BE), updated testcases' requirements and options for robustness. Is it OK for GCC10? gcc/ChangeLog 2019-03-20 Kewen Lin <li...@gcc.gnu.org> PR target/88497 * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of GIMPLE_BINARY_RHS check and gimple_visited_p check, call new function undistribute_bitref_for_vector. (undistribute_bitref_for_vector): New function. (cleanup_vinfo_map): Likewise. (unsigned_cmp): Likewise. gcc/testsuite/ChangeLog 2019-03-20 Kewen Lin <li...@gcc.gnu.org> * gcc.dg/tree-ssa/pr88497-1.c: New test. * gcc.dg/tree-ssa/pr88497-2.c: Likewise. * gcc.dg/tree-ssa/pr88497-3.c: Likewise. * gcc.dg/tree-ssa/pr88497-4.c: Likewise. * gcc.dg/tree-ssa/pr88497-5.c: Likewise. --- gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c | 44 +++++ gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c | 33 ++++ gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c | 33 ++++ gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c | 33 ++++ gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c | 33 ++++ gcc/tree-ssa-reassoc.c | 306 +++++++++++++++++++++++++++++- 6 files changed, 477 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c new file mode 100644 index 0000000..99c9af8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ +/* { dg-require-effective-target powerpc_vsx_ok { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-options "-O2 -ffast-math -mvsx -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */ + +/* To test reassoc can undistribute vector bit_field_ref summation. + + arg1 and arg2 are two arrays whose elements of type vector double. + Assuming: + A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3], + B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3], + + Then: + V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3, + + reassoc transforms + + accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] + + V3[0] + V3[1]; + + into: + + T = V0 + V1 + V2 + V3 + accumulator += T[0] + T[1]; + + Fewer bit_field_refs, only two for 128 or more bits vector. */ + +typedef double v2df __attribute__ ((vector_size (16))); +double +test (double accumulator, v2df arg1[], v2df arg2[]) +{ + v2df temp; + temp = arg1[0] * arg2[0]; + accumulator += temp[0] + temp[1]; + temp = arg1[1] * arg2[1]; + accumulator += temp[0] + temp[1]; + temp = arg1[2] * arg2[2]; + accumulator += temp[0] + temp[1]; + temp = arg1[3] * arg2[3]; + accumulator += temp[0] + temp[1]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c new file mode 100644 index 0000000..61ed0bf5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */ + +/* To test reassoc can undistribute vector bit_field_ref on multiplication. + + v1, v2, v3, v4 of type vector float. + + reassoc transforms + + accumulator *= v1[0] * v1[1] * v1[2] * v1[3] * + v2[0] * v2[1] * v2[2] * v2[3] * + v3[0] * v3[1] * v3[2] * v3[3] * + v4[0] * v4[1] * v4[2] * v4[3] ; + + into: + + T = v1 * v2 * v3 * v4; + accumulator *= T[0] * T[1] * T[2] * T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef float v4si __attribute__((vector_size(16))); +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator *= v1[0] * v1[1] * v1[2] * v1[3]; + accumulator *= v2[0] * v2[1] * v2[2] * v2[3]; + accumulator *= v3[0] * v3[1] * v3[2] * v3[3]; + accumulator *= v4[0] * v4[1] * v4[2] * v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c new file mode 100644 index 0000000..3790afc --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator &= v1[0] & v1[1] & v1[2] & v1[3] & + v2[0] & v2[1] & v2[2] & v2[3] & + v3[0] & v3[1] & v3[2] & v3[3] & + v4[0] & v4[1] & v4[2] & v4[3] ; + + into: + + T = v1 & v2 & v3 & v4; + accumulator &= T[0] & T[1] & T[2] & T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator &= v1[0] & v1[1] & v1[2] & v1[3]; + accumulator &= v2[0] & v2[1] & v2[2] & v2[3]; + accumulator &= v3[0] & v3[1] & v3[2] & v3[3]; + accumulator &= v4[0] & v4[1] & v4[2] & v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c new file mode 100644 index 0000000..1864aad --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator |= v1[0] | v1[1] | v1[2] | v1[3] | + v2[0] | v2[1] | v2[2] | v2[3] | + v3[0] | v3[1] | v3[2] | v3[3] | + v4[0] | v4[1] | v4[2] | v4[3] ; + + into: + + T = v1 | v2 | v3 | v4; + accumulator |= T[0] | T[1] | T[2] | T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator |= v1[0] | v1[1] | v1[2] | v1[3]; + accumulator |= v2[0] | v2[1] | v2[2] | v2[3]; + accumulator |= v3[0] | v3[1] | v3[2] | v3[3]; + accumulator |= v4[0] | v4[1] | v4[2] | v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c new file mode 100644 index 0000000..f747372 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -ffast-math" } */ +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^ + v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^ + v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^ + v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ; + + into: + + T = v1 ^ v2 ^ v3 ^ v4; + accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]; + accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3]; + accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3]; + accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c index e1c4dfe..a6cd85a 100644 --- a/gcc/tree-ssa-reassoc.c +++ b/gcc/tree-ssa-reassoc.c @@ -1772,6 +1772,295 @@ undistribute_ops_list (enum tree_code opcode, return changed; } +/* Hold the information of one specific VECTOR_TYPE SSA_NAME. + - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR. + - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF. */ +struct v_info +{ + auto_vec<unsigned HOST_WIDE_INT, 32> offsets; + auto_vec<unsigned, 32> ops_indexes; +}; + +typedef struct v_info *v_info_ptr; + +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets. */ +static int +unsigned_cmp (const void *p_i, const void *p_j) +{ + if (*(const unsigned HOST_WIDE_INT *) p_i + >= *(const unsigned HOST_WIDE_INT *) p_j) + return 1; + else + return -1; +} + +/* Cleanup hash map for VECTOR information. */ +static void +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map) +{ + for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin (); + it != info_map.end (); ++it) + { + v_info_ptr info = (*it).second; + delete info; + (*it).second = NULL; + } +} + +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE. + V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k] + is transformed to + Vs = (V1 + V2 + ... + Vn) + Vs[0] + Vs[1] + ... + Vs[k] + + The basic steps are listed below: + + 1) Check the addition chain *OPS by looking those summands coming from + VECTOR bit_field_ref on VECTOR type. Put the information into + v_info_map for each satisfied summand, using VECTOR SSA_NAME as key. + + 2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are + continous, they can cover the whole VECTOR perfectly without any holes. + Obtain one VECTOR list which contain candidates to be transformed. + + 3) Build the addition statements for all VECTOR candidates, generate + BIT_FIELD_REFs accordingly. + + TODO: + 1) The current implementation restrict all candidate VECTORs should have + the same VECTOR type, but it can be extended into different groups by + VECTOR types in future if any profitable cases found. + 2) The current implementation requires the whole VECTORs should be fully + covered, but it can be extended to support partial, checking adjacent + but not fill the whole, it may need some cost model to define the + boundary to do or not. +*/ +static bool +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops, + struct loop *loop) +{ + if (ops->length () <= 1) + return false; + + if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR + && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR) + return false; + + hash_map<tree, v_info_ptr> v_info_map; + operand_entry *oe1; + unsigned i; + + /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the + information into map. */ + FOR_EACH_VEC_ELT (*ops, i, oe1) + { + enum tree_code dcode; + gimple *oe1def; + + if (TREE_CODE (oe1->op) != SSA_NAME) + continue; + oe1def = SSA_NAME_DEF_STMT (oe1->op); + if (!is_gimple_assign (oe1def)) + continue; + dcode = gimple_assign_rhs_code (oe1def); + if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop)) + continue; + + tree rhs = gimple_op (oe1def, 1); + tree op0 = TREE_OPERAND (rhs, 0); + tree vec_type = TREE_TYPE (op0); + + if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE) + continue; + + tree op1 = TREE_OPERAND (rhs, 1); + tree op2 = TREE_OPERAND (rhs, 2); + + tree elem_type = TREE_TYPE (vec_type); + unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); + if (size != TREE_INT_CST_LOW (op1)) + continue; + + /* Ignore it if target machine can't support this VECTOR type. */ + if (!VECTOR_MODE_P (TYPE_MODE (vec_type))) + continue; + + /* Ignore it if target machine can't support this type of VECTOR + operation. */ + optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector); + if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing) + continue; + + v_info_ptr *info_ptr = v_info_map.get (op0); + if (info_ptr) + { + v_info_ptr info = *info_ptr; + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); + info->ops_indexes.safe_push (i); + } + else + { + v_info_ptr info = new v_info; + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); + info->ops_indexes.safe_push (i); + v_info_map.put (op0, info); + } + } + + /* At least two VECTOR to combine. */ + if (v_info_map.elements () <= 1) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + /* Use the first VECTOR and its information as the reference. + Firstly, we should validate it, that is: + 1) sorted offsets are adjacent, no holes. + 2) can fill the whole VECTOR perfectly. */ + hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin (); + tree ref_vec = (*it).first; + v_info_ptr ref_info = (*it).second; + ref_info->offsets.qsort (unsigned_cmp); + tree vec_type = TREE_TYPE (ref_vec); + tree elem_type = TREE_TYPE (vec_type); + unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); + unsigned HOST_WIDE_INT curr; + unsigned HOST_WIDE_INT prev = ref_info->offsets[0]; + + /* Continous check. */ + FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1) + { + if (curr != (prev + elem_size)) + { + cleanup_vinfo_map (v_info_map); + return false; + } + prev = curr; + } + + /* Check whether fill the whole. */ + if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec)))) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + auto_vec<tree> vectors (v_info_map.elements ()); + vectors.quick_push (ref_vec); + + /* Use the ref_vec to filter others. */ + for (++it; it != v_info_map.end (); ++it) + { + tree vec = (*it).first; + v_info_ptr info = (*it).second; + if (TREE_TYPE (ref_vec) != TREE_TYPE (vec)) + continue; + if (ref_info->offsets.length () != info->offsets.length ()) + continue; + bool same_offset = true; + info->offsets.qsort (unsigned_cmp); + for (unsigned i = 0; i < ref_info->offsets.length (); i++) + { + if (ref_info->offsets[i] != info->offsets[i]) + { + same_offset = false; + break; + } + } + if (!same_offset) + continue; + vectors.quick_push (vec); + } + + if (vectors.length () < 2) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + tree tr; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "The bit_field_ref vector list for undistribute: "); + FOR_EACH_VEC_ELT (vectors, i, tr) + { + print_generic_expr (dump_file, tr); + fprintf (dump_file, " "); + } + fprintf (dump_file, "\n"); + } + + /* Build the sum for all candidate VECTORs. */ + unsigned idx; + gimple *sum = NULL; + v_info_ptr info; + tree sum_vec = ref_vec; + FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1) + { + sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode); + info = *(v_info_map.get (tr)); + unsigned j; + FOR_EACH_VEC_ELT (info->ops_indexes, j, idx) + { + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); + gimple_set_visited (def, true); + if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR + || opcode == BIT_IOR_EXPR) + (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op)); + else if (opcode == MULT_EXPR) + (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op)); + else + { + gcc_assert (opcode == BIT_AND_EXPR); + (*ops)[idx]->op + = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op)); + } + (*ops)[idx]->rank = 0; + } + sum_vec = gimple_get_lhs (sum); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Generating addition -> "); + print_gimple_stmt (dump_file, sum, 0); + } + } + + /* Referring to any good shape VECTOR (here using ref_vec), generate the + BIT_FIELD_REF statements accordingly. */ + info = *(v_info_map.get (ref_vec)); + gcc_assert (sum); + FOR_EACH_VEC_ELT (info->ops_indexes, i, idx) + { + tree dst = make_ssa_name (elem_type); + gimple *gs + = gimple_build_assign (dst, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, elem_type, sum_vec, + TYPE_SIZE (elem_type), + bitsize_int (info->offsets[i]))); + insert_stmt_after (gs, sum); + update_stmt (gs); + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); + gimple_set_visited (def, true); + (*ops)[idx]->op = gimple_assign_lhs (gs); + (*ops)[idx]->rank = get_rank ((*ops)[idx]->op); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Generating bit_field_ref -> "); + print_gimple_stmt (dump_file, gs, 0); + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n"); + } + + cleanup_vinfo_map (v_info_map); + + return true; +} + /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison expression, examine the other OPS to see if any of them are comparisons of the same values, which we may be able to combine or eliminate. @@ -5880,11 +6169,6 @@ reassociate_bb (basic_block bb) tree lhs, rhs1, rhs2; enum tree_code rhs_code = gimple_assign_rhs_code (stmt); - /* If this is not a gimple binary expression, there is - nothing for us to do with it. */ - if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) - continue; - /* If this was part of an already processed statement, we don't need to touch it again. */ if (gimple_visited_p (stmt)) @@ -5911,6 +6195,11 @@ reassociate_bb (basic_block bb) continue; } + /* If this is not a gimple binary expression, there is + nothing for us to do with it. */ + if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) + continue; + lhs = gimple_assign_lhs (stmt); rhs1 = gimple_assign_rhs1 (stmt); rhs2 = gimple_assign_rhs2 (stmt); @@ -5950,6 +6239,13 @@ reassociate_bb (basic_block bb) optimize_ops_list (rhs_code, &ops); } + if (undistribute_bitref_for_vector (rhs_code, &ops, + loop_containing_stmt (stmt))) + { + ops.qsort (sort_by_operand_rank); + optimize_ops_list (rhs_code, &ops); + } + if (rhs_code == PLUS_EXPR && transform_add_to_multiply (&ops)) ops.qsort (sort_by_operand_rank); -- 2.7.4