Apologies -- a last-minute attempt to clean up and enhance this patch broke it; the fixed version is below. The main difference is in how we check whether the loop being transformed might be executed on the target: checking decl->offloadable isn't enough, because target region outlining might not have happened yet; in that case, we need to walk the region tree upwards to check whether any containing region is a target region.
Alexander diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index a3c4a90..3189e96 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -142,6 +142,28 @@ expand_ANNOTATE (gcall *) gcc_unreachable (); } +/* Lane index on SIMT targets: thread index in the warp on NVPTX. On targets + without SIMT execution this should be expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_LANE (gcall *stmt) +{ + tree lhs = gimple_call_lhs (stmt); + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + /* FIXME: use a separate pattern for OpenMP? */ + gcc_assert (targetm.have_oacc_dim_pos ()); + emit_insn (targetm.gen_oacc_dim_pos (target, const2_rtx)); +} + +/* This should get expanded in omp_device_lower pass. */ + +static void +expand_GOMP_SIMT_VF (gcall *) +{ + gcc_unreachable (); +} + /* This should get expanded in adjust_simduid_builtins. */ static void diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 1cb14a8..66c7422 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -41,6 +41,8 @@ along with GCC; see the file COPYING3. 
If not see DEF_INTERNAL_FN (LOAD_LANES, ECF_CONST | ECF_LEAF, NULL) DEF_INTERNAL_FN (STORE_LANES, ECF_CONST | ECF_LEAF, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index cc0435e..0478b2a 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -10173,7 +10173,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) OMP_CLAUSE_SAFELEN); tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE__SIMDUID_); - tree n1, n2; + tree n1, n2, step, simt_lane; type = TREE_TYPE (fd->loop.v); entry_bb = region->entry; @@ -10218,12 +10218,36 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) n1 = fd->loop.n1; n2 = fd->loop.n2; + step = fd->loop.step; + bool offloaded = cgraph_node::get (current_function_decl)->offloadable; + for (struct omp_region *reg = region; !offloaded && reg; reg = reg->outer) + offloaded = reg->type == GIMPLE_OMP_TARGET; + bool do_simt_transform + = offloaded && !broken_loop && !safelen && !simduid && !(fd->collapse > 1); + if (do_simt_transform) + { + simt_lane + = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_LANE, + integer_type_node, 0); + simt_lane = fold_convert (TREE_TYPE (step), simt_lane); + simt_lane = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, simt_lane); + cfun->curr_properties &= ~PROP_gimple_lomp_dev; + } + if (gimple_omp_for_combined_into_p (fd->for_stmt)) { tree innerc = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt), OMP_CLAUSE__LOOPTEMP_); gcc_assert (innerc); n1 = OMP_CLAUSE_DECL (innerc); + if (do_simt_transform) + { + n1 = fold_convert (type, n1); + if (POINTER_TYPE_P 
(type)) + n1 = fold_build_pointer_plus (n1, simt_lane); + else + n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, simt_lane)); + } innerc = find_omp_clause (OMP_CLAUSE_CHAIN (innerc), OMP_CLAUSE__LOOPTEMP_); gcc_assert (innerc); @@ -10239,8 +10263,15 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else { - expand_omp_build_assign (&gsi, fd->loop.v, - fold_convert (type, fd->loop.n1)); + if (do_simt_transform) + { + n1 = fold_convert (type, n1); + if (POINTER_TYPE_P (type)) + n1 = fold_build_pointer_plus (n1, simt_lane); + else + n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, simt_lane)); + } + expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1)); if (fd->collapse > 1) for (i = 0; i < fd->collapse; i++) { @@ -10262,10 +10293,18 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) stmt = gsi_stmt (gsi); gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE); + if (do_simt_transform) + { + tree simt_vf + = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF, + integer_type_node, 0); + simt_vf = fold_convert (TREE_TYPE (step), simt_vf); + step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, simt_vf); + } if (POINTER_TYPE_P (type)) - t = fold_build_pointer_plus (fd->loop.v, fd->loop.step); + t = fold_build_pointer_plus (fd->loop.v, step); else - t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step); + t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step); expand_omp_build_assign (&gsi, fd->loop.v, t); if (fd->collapse > 1) @@ -12960,7 +12999,6 @@ expand_omp (struct omp_region *region) } } - /* Helper for build_omp_regions. Scan the dominator tree starting at block BB. PARENT is the region that contains BB. 
If SINGLE_TREE is true, the function ends once a single tree is built (otherwise, whole @@ -16235,7 +16273,7 @@ const pass_data pass_data_lower_omp = OPTGROUP_NONE, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ - PROP_gimple_lomp, /* properties_provided */ + PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */ 0, /* properties_destroyed */ 0, /* todo_flags_start */ 0, /* todo_flags_finish */ @@ -19470,5 +19508,90 @@ make_pass_oacc_device_lower (gcc::context *ctxt) { return new pass_oacc_device_lower (ctxt); } + + +/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets, + VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and + LANE is kept to be expanded to RTL later on. */ + +static unsigned int +execute_omp_device_lower () +{ + int vf = 1; + if (targetm.simt.vf) + vf = targetm.simt.vf (); + tree vf_tree = build_int_cst (integer_type_node, vf); + basic_block bb; + gimple_stmt_iterator gsi; + FOR_EACH_BB_FN (bb, cfun) + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt)) + continue; + tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_GOMP_SIMT_LANE: + rhs = vf == 1 ? 
integer_zero_node : NULL_TREE; + break; + case IFN_GOMP_SIMT_VF: + rhs = vf_tree; + break; + default: + break; + } + if (!rhs) + continue; + stmt = gimple_build_assign (lhs, rhs); + gsi_replace (&gsi, stmt, false); + } + if (vf != 1) + cfun->has_force_vectorize_loops = false; + return 0; +} + +namespace { + +const pass_data pass_data_omp_device_lower = +{ + GIMPLE_PASS, /* type */ + "ompdevlow", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + PROP_gimple_lomp_dev, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa, /* todo_flags_finish */ +}; + +class pass_omp_device_lower : public gimple_opt_pass +{ +public: + pass_omp_device_lower (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_device_lower, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *fun) + { + /* FIXME: inlining does not propagate the lomp_dev property. */ + return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev); + } + virtual unsigned int execute (function *) + { + return execute_omp_device_lower (); + } + +}; // class pass_expand_omp_ssa + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_device_lower (gcc::context *ctxt) +{ + return new pass_omp_device_lower (ctxt); +} #include "gt-omp-low.h" diff --git a/gcc/passes.def b/gcc/passes.def index c0ab6b9..ec049f8 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -151,6 +151,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_fixup_cfg); NEXT_PASS (pass_lower_eh_dispatch); NEXT_PASS (pass_oacc_device_lower); + NEXT_PASS (pass_omp_device_lower); NEXT_PASS (pass_all_optimizations); PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations) NEXT_PASS (pass_remove_cgraph_callee_edges); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 49e22a9..71b2561 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -226,6 +226,7 @@ protected: of math functions; the current choices have been optimized. 
*/ +#define PROP_gimple_lomp_dev (1 << 16) /* done omp_device_lower */ #define PROP_trees \ (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp) @@ -414,6 +415,7 @@ extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt); extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt); extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt);