On Thu, 15 Aug 2019 at 01:50, Richard Sandiford <richard.sandif...@arm.com> wrote: > > Richard Biener <richard.guent...@gmail.com> writes: > > On Wed, Aug 14, 2019 at 6:49 PM Richard Biener > > <richard.guent...@gmail.com> wrote: > >> > >> On Wed, Aug 14, 2019 at 5:06 PM Prathamesh Kulkarni > >> <prathamesh.kulka...@linaro.org> wrote: > >> > > >> > Hi, > >> > The attached patch tries to fix PR86753. > >> > > >> > For following test: > >> > void > >> > f1 (int *restrict x, int *restrict y, int *restrict z) > >> > { > >> > for (int i = 0; i < 100; ++i) > >> > x[i] = y[i] ? z[i] : 10; > >> > } > >> > > >> > vect dump shows: > >> > vect_cst__42 = { 0, ... }; > >> > vect_cst__48 = { 0, ... }; > >> > > >> > vect__4.7_41 = .MASK_LOAD (vectp_y.5_38, 4B, loop_mask_40); > >> > _4 = *_3; > >> > _5 = z_12(D) + _2; > >> > mask__35.8_43 = vect__4.7_41 != vect_cst__42; > >> > _35 = _4 != 0; > >> > vec_mask_and_46 = mask__35.8_43 & loop_mask_40; > >> > vect_iftmp.11_47 = .MASK_LOAD (vectp_z.9_44, 4B, vec_mask_and_46); > >> > iftmp.0_13 = 0; > >> > vect_iftmp.12_50 = VEC_COND_EXPR <vect__4.7_41 != vect_cst__48, > >> > vect_iftmp.11_47, vect_cst__49>; > >> > > >> > and following code-gen: > >> > L2: > >> > ld1w z0.s, p2/z, [x1, x3, lsl 2] > >> > cmpne p1.s, p3/z, z0.s, #0 > >> > cmpne p0.s, p2/z, z0.s, #0 > >> > ld1w z0.s, p0/z, [x2, x3, lsl 2] > >> > sel z0.s, p1, z0.s, z1.s > >> > > >> > We could reuse vec_mask_and_46 in vec_cond_expr since the conditions > >> > vect__4.7_41 != vect_cst__48 and vect__4.7_41 != vect_cst__42 > >> > are equivalent, and vect_iftmp.11_47 depends on vect__4.7_41 != > >> > vect_cst__48. > >> > > >> > I suppose in general for vec_cond_expr <C, T, E> if T comes from masked > >> > load, > >> > which is conditional on C, then we could reuse the mask used in load, > >> > in vec_cond_expr ? > >> > > >> > The patch maintains a hash_map cond_to_vec_mask > >> > from <cond, loop_mask -> vec_mask (with loop predicate applied). > >> > In prepare_load_store_mask, we record <cond, loop_mask> -> vec_mask & > >> > loop_mask, > >> > and in vectorizable_condition, we check if <cond, loop_mask> exists in > >> > cond_to_vec_mask > >> > and if found, the corresponding vec_mask is used as 1st operand of > >> > vec_cond_expr. > >> > > >> > <cond, loop_mask> is represented with cond_vmask_key, and the patch > >> > adds tree_cond_ops to represent condition operator and operands coming > >> > either from cond_expr > >> > or a gimple comparison stmt. If the stmt is not comparison, it returns > >> > <ne_expr, lhs, 0> and inserts that into cond_to_vec_mask. > >> > > >> > With patch, the redundant p1 is eliminated and sel uses p0 for above > >> > test. > >> > > >> > For following test: > >> > void > >> > f2 (int *restrict x, int *restrict y, int *restrict z, int fallback) > >> > { > >> > for (int i = 0; i < 100; ++i) > >> > x[i] = y[i] ? z[i] : fallback; > >> > } > >> > > >> > input to vectorizer has operands swapped in cond_expr: > >> > _36 = _4 != 0; > >> > iftmp.0_14 = .MASK_LOAD (_5, 32B, _36); > >> > iftmp.0_8 = _4 == 0 ? fallback_12(D) : iftmp.0_14; > >> > > >> > So we need to check for inverted condition in cond_to_vec_mask, > >> > and swap the operands. > >> > Does the patch look OK so far ? > >> > > >> > One major issue remaining with the patch is value numbering. > >> > Currently, it does value numbering for entire function using sccvn > >> > during start of vect pass, which is too expensive since we only need > >> > block based VN. I am looking into that. > >> > >> Why do you need it at all? We run VN on the if-converted loop bodies btw. > > This was my suggestion, but with the idea being to do the numbering > per-statement as we vectorise. We'll then see pattern statements too. > > That's important because we use pattern statements to set the right > vector boolean type (e.g. vect_recog_mask_conversion_pattern). > So some of the masks we care about don't exist after if converison. > > > Also I can't trivially see the equality of the masks and probably so > > can't VN. Is it that we just don't bother to apply loop_mask to > > VEC_COND but there's no harm if we do? > > Yeah. The idea of the optimisation is to decide when it's more profitable > to apply the loop mask, even though doing so isn't necessary. It would > be hard to do after vectorisation because the masks aren't equivalent. > We're relying on knowledge of how the vectoriser uses the result. Hi, Sorry for late response. This is an updated patch, that integrates block-based VN into vect pass. The patch (a) Exports visit_stmt (renamed to vn_visit_stmt), vn_bb_init to initialize VN state, and vn_bb_free to free it. (b) Calls vn_visit_stmt in vect_transform_stmt for value numbering stmts. We're only interested in obtaining value numbers, not eliminating redundancies. Does it look in the right direction ? I am not sure if the initialization in vn_bb_init is entirely correct.
PS: The patch seems to regress fmla_2.c. I am looking into it. Thanks, Prathamesh > > Thanks, > Richard
diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c index eb7e4be09e6..26a46757854 100644 --- a/gcc/tree-ssa-sccvn.c +++ b/gcc/tree-ssa-sccvn.c @@ -4795,8 +4795,8 @@ try_to_simplify (gassign *stmt) /* Visit and value number STMT, return true if the value number changed. */ -static bool -visit_stmt (gimple *stmt, bool backedges_varying_p = false) +bool +vn_visit_stmt (gimple *stmt, bool backedges_varying_p) { bool changed = false; @@ -6416,7 +6416,7 @@ process_bb (rpo_elim &avail, basic_block bb, } /* When not iterating force backedge values to varying. */ - visit_stmt (phi, !iterate_phis); + vn_visit_stmt (phi, !iterate_phis); if (virtual_operand_p (res)) continue; @@ -6513,7 +6513,7 @@ process_bb (rpo_elim &avail, basic_block bb, the visited flag in SSA_VAL. */ } - visit_stmt (gsi_stmt (gsi)); + vn_visit_stmt (gsi_stmt (gsi)); gimple *last = gsi_stmt (gsi); e = NULL; @@ -6783,6 +6783,59 @@ do_unwind (unwind_state *to, int rpo_idx, rpo_elim &avail, int *bb_to_rpo) } } +/* Value-numbering per basic block. */ + +static unsigned +bb_stmts (basic_block bb) +{ + unsigned n_stmts = 0; + + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (gimple_code (stmt) != GIMPLE_DEBUG) + n_stmts++; + } + + return n_stmts; +} + +void +vn_bb_init(basic_block bb) +{ + /* Create the VN state. */ + + unsigned bb_size = bb_stmts (bb); + VN_TOP = create_tmp_var_raw (void_type_node, "vn_top"); + next_value_id = 1; + + vn_ssa_aux_hash = new hash_table <vn_ssa_aux_hasher> (bb_size); + gcc_obstack_init (&vn_ssa_aux_obstack); + + gcc_obstack_init (&vn_tables_obstack); + gcc_obstack_init (&vn_tables_insert_obstack); + valid_info = XCNEW (struct vn_tables_s); + allocate_vn_table (valid_info, bb_size); + last_inserted_ref = NULL; + last_inserted_phi = NULL; + last_inserted_nary = NULL; + + rpo_elim *x = XOBNEW (&vn_ssa_aux_obstack, rpo_elim); + rpo_avail = new(x) rpo_elim (bb); + vn_valueize = rpo_vn_valueize; +} + +void +vn_bb_free () +{ + free_vn_table (valid_info); + XDELETE (valid_info); + obstack_free (&vn_tables_obstack, NULL); + obstack_free (&vn_tables_insert_obstack, NULL); +} + /* Do VN on a SEME region specified by ENTRY and EXIT_BBS in FN. If ITERATE is true then treat backedges optimistically as not executed and iterate. If ELIMINATE is true then perform diff --git a/gcc/tree-ssa-sccvn.h b/gcc/tree-ssa-sccvn.h index 1a5f2389586..8e134446779 100644 --- a/gcc/tree-ssa-sccvn.h +++ b/gcc/tree-ssa-sccvn.h @@ -290,4 +290,8 @@ extern tree (*vn_valueize) (tree); extern basic_block vn_context_bb; +void vn_bb_init (basic_block); +void vn_bb_free (void); +bool vn_visit_stmt (gimple *stmt, bool backedges_varying_p = false); + #endif /* TREE_SSA_SCCVN_H */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index b0cbbac0cb5..199ad7e14a5 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -54,6 +54,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-vector-builder.h" #include "vec-perm-indices.h" #include "tree-eh.h" +#include "tree-ssa-sccvn.h" /* Loop Vectorization Pass. @@ -8608,6 +8609,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) { basic_block bb = bbs[i]; stmt_vec_info stmt_info; + vn_bb_init (bb); + loop_vinfo->cond_to_vec_mask = new cond_vmask_map_type (8); for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) @@ -8717,6 +8720,10 @@ vect_transform_loop (loop_vec_info loop_vinfo) } } } + + delete loop_vinfo->cond_to_vec_mask; + loop_vinfo->cond_to_vec_mask = 0; + vn_bb_free (); } /* BBs in loop */ /* The vectorization factor is always > 1, so if we use an IV increment of 1. diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 1e2dfe5d22d..ca985a75cc6 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -55,6 +55,7 @@ along with GCC; see the file COPYING3. If not see #include "gimple-fold.h" #include "regs.h" #include "attribs.h" +#include "tree-ssa-sccvn.h" /* For lang_hooks.types.type_for_mode. */ #include "langhooks.h" @@ -1989,17 +1990,31 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, static tree prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask, - gimple_stmt_iterator *gsi) + gimple_stmt_iterator *gsi, tree mask, + cond_vmask_map_type *cond_to_vec_mask) { gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask))); if (!loop_mask) return vec_mask; gcc_assert (TREE_TYPE (loop_mask) == mask_type); + + tree *slot = 0; + if (cond_to_vec_mask) + { + cond_vmask_key cond (mask, loop_mask); + slot = &cond_to_vec_mask->get_or_insert (cond); + if (*slot) + return *slot; + } + tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and"); gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR, vec_mask, loop_mask); gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT); + + if (slot) + *slot = and_res; return and_res; } @@ -3514,8 +3529,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, gcc_assert (ncopies == 1); tree mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_out, i); + tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno); vargs[mask_opno] = prepare_load_store_mask - (TREE_TYPE (mask), mask, vargs[mask_opno], gsi); + (TREE_TYPE (mask), mask, vargs[mask_opno], gsi, + scalar_mask, vinfo->cond_to_vec_mask); } gcall *call; @@ -3564,9 +3581,11 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { tree mask = vect_get_loop_mask (gsi, masks, ncopies, vectype_out, j); + tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno); vargs[mask_opno] = prepare_load_store_mask (TREE_TYPE (mask), mask, - vargs[mask_opno], gsi); + vargs[mask_opno], gsi, + scalar_mask, vinfo->cond_to_vec_mask); } if (cfn == CFN_GOMP_SIMD_LANE) @@ -8109,7 +8128,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectype, j); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, - vec_mask, gsi); + vec_mask, gsi, mask, + vinfo->cond_to_vec_mask); gcall *call; if (final_mask) @@ -8163,7 +8183,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectype, vec_num * j + i); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, - vec_mask, gsi); + vec_mask, gsi, mask, + vinfo->cond_to_vec_mask); if (memory_access_type == VMAT_GATHER_SCATTER) { @@ -9304,7 +9325,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectype, j); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, - vec_mask, gsi); + vec_mask, gsi, mask, + vinfo->cond_to_vec_mask); gcall *call; if (final_mask) @@ -9355,7 +9377,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectype, vec_num * j + i); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, - vec_mask, gsi); + vec_mask, gsi, mask, + vinfo->cond_to_vec_mask); if (i > 0) dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, @@ -9975,6 +9998,38 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Handle cond expr. */ for (j = 0; j < ncopies; j++) { + tree vec_mask = NULL_TREE; + + if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && TREE_CODE_CLASS (TREE_CODE (cond_expr)) == tcc_comparison + && loop_vinfo->cond_to_vec_mask) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + if (masks) + { + tree loop_mask = vect_get_loop_mask (gsi, masks, + ncopies, vectype, j); + + cond_vmask_key cond (cond_expr, loop_mask); + tree *slot = loop_vinfo->cond_to_vec_mask->get (cond); + if (slot && *slot) + vec_mask = *slot; + else + { + cond.cond_ops.code + = invert_tree_comparison (cond.cond_ops.code, true); + slot = loop_vinfo->cond_to_vec_mask->get (cond); + if (slot && *slot) + { + vec_mask = *slot; + tree tmp = then_clause; + then_clause = else_clause; + else_clause = tmp; + } + } + } + } + stmt_vec_info new_stmt_info = NULL; if (j == 0) { @@ -10054,6 +10109,8 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (masked) vec_compare = vec_cond_lhs; + else if (vec_mask) + vec_compare = vec_mask; else { vec_cond_rhs = vec_oprnds1[i]; @@ -10700,6 +10757,9 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, stmt_info)); gimple *stmt = stmt_info->stmt; + if (vinfo->cond_to_vec_mask) + vn_visit_stmt (stmt, true); + switch (STMT_VINFO_TYPE (stmt_info)) { case type_demotion_vec_info_type: diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index dc181524744..065e1467796 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -82,7 +82,6 @@ along with GCC; see the file COPYING3. If not see #include "opt-problem.h" #include "internal-fn.h" - /* Loop or bb location, with hotness information. */ dump_user_location_t vect_location; @@ -461,7 +460,8 @@ vec_info::vec_info (vec_info::vec_kind kind_in, void *target_cost_data_in, vec_info_shared *shared_) : kind (kind_in), shared (shared_), - target_cost_data (target_cost_data_in) + target_cost_data (target_cost_data_in), + cond_to_vec_mask (0) { stmt_vec_infos.create (50); } @@ -1033,7 +1033,6 @@ try_vectorize_loop (hash_table<simduid_to_vf> *&simduid_to_vf_htab, vect_loop_dist_alias_call (loop)); } - /* Function vectorize_loops. Entry point to loop vectorization phase. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 1456cde4c2c..87878c81a66 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -26,6 +26,8 @@ typedef class _stmt_vec_info *stmt_vec_info; #include "tree-data-ref.h" #include "tree-hash-traits.h" #include "target.h" +#include "tree-ssa-sccvn.h" +#include "hash-map.h" /* Used for naming of new temporaries. */ enum vect_var_kind { @@ -193,6 +195,92 @@ public: poly_uint64 min_value; }; +struct cond_vmask_key +{ + cond_vmask_key (tree t, tree loop_mask_) + : cond_ops (t), loop_mask (loop_mask_) + {} + + static unsigned get_value_id (tree x) + { + if (TREE_CONSTANT (x)) + return get_or_alloc_constant_value_id (x); + return VN_INFO (x)->value_id; + } + + hashval_t hash () const + { + inchash::hash h; + h.add_int (cond_ops.code); + h.add_int (get_value_id (cond_ops.op0)); + h.add_int (get_value_id (cond_ops.op1)); + h.add_int (SSA_NAME_VERSION (loop_mask)); + return h.end (); + } + + void mark_empty () + { + loop_mask = NULL_TREE; + } + + bool is_empty () + { + return loop_mask == NULL_TREE; + } + + tree_cond_ops cond_ops; + tree loop_mask; +}; + +inline bool operator== (const cond_vmask_key &c1, const cond_vmask_key &c2) +{ + return c1.loop_mask == c2.loop_mask + && c1.cond_ops.code == c2.cond_ops.code + && cond_vmask_key::get_value_id (c1.cond_ops.op0) + == cond_vmask_key::get_value_id (c2.cond_ops.op0) + && cond_vmask_key::get_value_id (c1.cond_ops.op1) + == cond_vmask_key::get_value_id (c2.cond_ops.op1); +} + +struct cond_vmask_key_traits +{ + typedef cond_vmask_key value_type; + typedef cond_vmask_key compare_type; + + static inline hashval_t hash (value_type v) + { + return v.hash (); + } + + static inline bool equal (value_type existing, value_type candidate) + { + return existing == candidate; + } + + static inline void mark_empty (value_type& v) + { + v.mark_empty (); + } + + static inline bool is_empty (value_type v) + { + return v.is_empty (); + } + + static void mark_deleted (value_type&) {} + + static inline bool is_deleted (value_type) + { + return false; + } + + static inline void remove (value_type &) {} +}; + +typedef hash_map<cond_vmask_key, tree, + simple_hashmap_traits <cond_vmask_key_traits, tree> > + cond_vmask_map_type; + /* Vectorizer state shared between different analyses like vector sizes of the same CFG region. */ class vec_info_shared { @@ -255,6 +343,8 @@ public: /* Cost data used by the target cost model. */ void *target_cost_data; + cond_vmask_map_type *cond_to_vec_mask; + private: stmt_vec_info new_stmt_vec_info (gimple *stmt); void set_vinfo_for_stmt (gimple *, stmt_vec_info); diff --git a/gcc/tree.c b/gcc/tree.c index 8f80012c6e8..32a8fcf1eb8 100644 --- a/gcc/tree.c +++ b/gcc/tree.c @@ -15204,6 +15204,44 @@ max_object_size (void) return TYPE_MAX_VALUE (ptrdiff_type_node); } +/* If code(T) is comparison op or def of comparison stmt, + extract it's operands. + Else return <NE_EXPR, T, 0>. */ + +tree_cond_ops::tree_cond_ops (tree t) +{ + if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison) + { + this->code = TREE_CODE (t); + this->op0 = TREE_OPERAND (t, 0); + this->op1 = TREE_OPERAND (t, 1); + return; + } + + else if (TREE_CODE (t) == SSA_NAME) + { + gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)); + if (stmt) + { + tree_code code = gimple_assign_rhs_code (stmt); + if (TREE_CODE_CLASS (code) == tcc_comparison) + { + this->code = code; + this->op0 = gimple_assign_rhs1 (stmt); + this->op1 = gimple_assign_rhs2 (stmt); + return; + } + } + + this->code = NE_EXPR; + this->op0 = t; + this->op1 = build_zero_cst (TREE_TYPE (t)); + } + + else + gcc_unreachable (); +} + #if CHECKING_P namespace selftest { diff --git a/gcc/tree.h b/gcc/tree.h index 94dbb95a78a..6b9385129a8 100644 --- a/gcc/tree.h +++ b/gcc/tree.h @@ -6141,4 +6141,14 @@ public: operator location_t () const { return m_combined_loc; } }; +struct tree_cond_ops +{ + tree_code code; + tree op0; + tree op1; + + tree_cond_ops (tree); +}; + + #endif /* GCC_TREE_H */