This reverts commit 31e53aef12f574a8534f8aea219b5466edb75b32. Re-measuring the effect of this patch across a set of benchmarks shows that it appears to have an overall slightly negative effect on performance. Thus, we are reverting it. --- gcc/ChangeLog.omp | 11 ++ gcc/config/nvptx/nvptx.c | 229 ---------------------------------- libgomp/ChangeLog.omp | 13 ++ libgomp/plugin/plugin-nvptx.c | 163 +++++++++++++++++++++--- 4 files changed, 173 insertions(+), 243 deletions(-)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index 88754626121..b97c1acaf49 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,14 @@ +2021-10-12 Julian Brown <jul...@codesourcery.com> + + Revert: + + 2019-09-10 Chung-Lin Tang <clt...@codesourcery.com> + + * config/nvptx/nvptx.c (nvptx_expand_to_rtl_hook): New function + implementing CUDA .params space transformation. + (TARGET_EXPAND_TO_RTL_HOOK): implement hook with + nvptx_expand_to_rtl_hook. + 2021-10-12 Tobias Burnus <tob...@codesourcery.com> Backported from master: diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 143be974fe8..e23c3902306 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -68,10 +68,6 @@ #include "attribs.h" #include "tree-vrp.h" #include "tree-ssa-operands.h" -#include "tree-pretty-print.h" -#include "gimple-pretty-print.h" -#include "tree-cfg.h" -#include "gimple-ssa.h" #include "tree-ssanames.h" #include "gimplify.h" #include "tree-phinodes.h" @@ -6339,228 +6335,6 @@ nvptx_libc_has_function (enum function_class fn_class, tree type) return default_libc_has_function (fn_class, type); } -static void -nvptx_expand_to_rtl_hook (void) -{ - /* For utilizing CUDA .param kernel arguments, we detect and modify - the gimple of offloaded child functions, here before RTL expansion, - starting with standard OMP form: - foo._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i) { ... } - - and transform it into a style where the OMP data record fields are - "exploded" into individual scalar arguments: - foo._omp_fn.0 (int * a, int * b, int * c) { ... } - - Note that there are implicit assumptions of how OMP lowering (and/or other - intervening passes) behaves contained in this transformation code; - if those passes change in their output, this code may possibly need - updating. */ - - if (lookup_attribute ("omp target entrypoint", - DECL_ATTRIBUTES (current_function_decl)) - /* The rather indirect manner in which OpenMP target functions are - launched makes this transformation only valid for OpenACC currently. - TODO: e.g. write_omp_entry(), nvptx_declare_function_name(), etc. - needs changes for this to work with OpenMP. */ - && lookup_attribute ("oacc function", - DECL_ATTRIBUTES (current_function_decl)) - && VOID_TYPE_P (TREE_TYPE (DECL_RESULT (current_function_decl)))) - { - tree omp_data_arg = DECL_ARGUMENTS (current_function_decl); - tree argtype = TREE_TYPE (omp_data_arg); - - /* Ensure this function is of the form of a single reference argument - to the OMP data record, or a single void* argument (when no values - passed) */ - if (! (DECL_CHAIN (omp_data_arg) == NULL_TREE - && ((TREE_CODE (argtype) == REFERENCE_TYPE - && TREE_CODE (TREE_TYPE (argtype)) == RECORD_TYPE) - || (TREE_CODE (argtype) == POINTER_TYPE - && TREE_TYPE (argtype) == void_type_node)))) - return; - - if (dump_file) - { - fprintf (dump_file, "Detected offloaded child function %s, " - "starting parameter conversion\n", - print_generic_expr_to_str (current_function_decl)); - fprintf (dump_file, "OMP data record argument: %s (tree type: %s)\n", - print_generic_expr_to_str (omp_data_arg), - print_generic_expr_to_str (argtype)); - fprintf (dump_file, "Data record fields:\n"); - } - - hash_map<tree,tree> fld_to_args; - tree fld, rectype = TREE_TYPE (argtype); - tree arglist = NULL_TREE, argtypelist = NULL_TREE; - - if (TREE_CODE (rectype) == RECORD_TYPE) - { - /* For each field in the OMP data record type, create a corresponding - PARM_DECL, and map field -> parm using the fld_to_args hash_map. - Also create the tree chains for creating function type and - DECL_ARGUMENTS below. */ - for (fld = TYPE_FIELDS (rectype); fld; fld = DECL_CHAIN (fld)) - { - tree narg = build_decl (DECL_SOURCE_LOCATION (fld), PARM_DECL, - DECL_NAME (fld), TREE_TYPE (fld)); - DECL_ARTIFICIAL (narg) = 1; - DECL_ARG_TYPE (narg) = TREE_TYPE (fld); - DECL_CONTEXT (narg) = current_function_decl; - TREE_USED (narg) = 1; - TREE_READONLY (narg) = 1; - - if (dump_file) - fprintf (dump_file, "\t%s, type: %s, offset: %s bytes + %s bits\n", - print_generic_expr_to_str (fld), - print_generic_expr_to_str (TREE_TYPE (fld)), - print_generic_expr_to_str (DECL_FIELD_OFFSET (fld)), - print_generic_expr_to_str (DECL_FIELD_BIT_OFFSET (fld))); - fld_to_args.put (fld, narg); - - TREE_CHAIN (narg) = arglist; - arglist = narg; - argtypelist = tree_cons (NULL_TREE, TREE_TYPE (narg), - argtypelist); - } - arglist = nreverse (arglist); - argtypelist = nreverse (argtypelist); - } - /* This is needed to not be mistaken for a stdarg function. */ - argtypelist = chainon (argtypelist, void_list_node); - - if (dump_file) - { - fprintf (dump_file, "Function before OMP data arg replaced:\n"); - dump_function_to_file (current_function_decl, dump_file, dump_flags); - } - - /* Actually modify the tree type and DECL_ARGUMENTS here. */ - TREE_TYPE (current_function_decl) = build_function_type (void_type_node, - argtypelist); - DECL_ARGUMENTS (current_function_decl) = arglist; - - /* Remove local decls which correspond to *.omp_data_i->FIELD entries, by - scanning and skipping those entries, creating a new local_decls list. - We assume a very specific MEM_REF tree expression shape. */ - tree decl; - unsigned int i; - vec<tree, va_gc> *new_local_decls = NULL; - FOR_EACH_VEC_SAFE_ELT (cfun->local_decls, i, decl) - { - if (DECL_HAS_VALUE_EXPR_P (decl)) - { - tree t = DECL_VALUE_EXPR (decl); - if (TREE_CODE (t) == MEM_REF - && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF - && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0)) == MEM_REF - && (TREE_OPERAND (TREE_OPERAND (TREE_OPERAND (t, 0), 0), 0) - == omp_data_arg)) - continue; - } - vec_safe_push (new_local_decls, decl); - } - vec_free (cfun->local_decls); - cfun->local_decls = new_local_decls; - - /* Scan function body for assignments from .omp_data_i->FIELD, and using - the above created fld_to_args hash map, convert them to reads of - function arguments. */ - basic_block bb; - gimple_stmt_iterator gsi; - FOR_EACH_BB_FN (bb, cfun) - for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) - { - tree val, *val_ptr = NULL; - gimple *stmt = gsi_stmt (gsi); - if (is_gimple_assign (stmt) - && gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS) - val_ptr = gimple_assign_rhs1_ptr (stmt); - else if (is_gimple_debug (stmt) && gimple_debug_bind_p (stmt)) - val_ptr = gimple_debug_bind_get_value_ptr (stmt); - - if (val_ptr == NULL || (val = *val_ptr) == NULL_TREE) - continue; - - tree new_val = NULL_TREE, fld = NULL_TREE; - - if (TREE_CODE (val) == COMPONENT_REF - && TREE_CODE (TREE_OPERAND (val, 0)) == MEM_REF - && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (val, 0), 0)) - == SSA_NAME) - && (SSA_NAME_VAR (TREE_OPERAND (TREE_OPERAND (val, 0), 0)) - == omp_data_arg)) - { - /* .omp_data->FIELD case. */ - fld = TREE_OPERAND (val, 1); - new_val = *fld_to_args.get (fld); - } - else if (TREE_CODE (val) == MEM_REF - && TREE_CODE (TREE_OPERAND (val, 0)) == SSA_NAME - && SSA_NAME_VAR (TREE_OPERAND (val, 0)) == omp_data_arg) - { - /* This case may happen in the final tree level optimization - output, due to SLP: - vect.XX = MEM <vector(1) unsigned long> [(void *).omp_data_i_5(D) + 8B] - - Therefore here we need a more elaborate search of the field - list to reverse map to which field the offset is referring - to. */ - unsigned HOST_WIDE_INT offset - = tree_to_uhwi (TREE_OPERAND (val, 1)); - - for (hash_map<tree, tree>::iterator i = fld_to_args.begin (); - i != fld_to_args.end (); ++i) - { - tree cur_fld = (*i).first; - tree cur_arg = (*i).second; - gcc_assert (TREE_CODE (cur_arg) == PARM_DECL); - - unsigned HOST_WIDE_INT cur_offset = - (tree_to_uhwi (DECL_FIELD_OFFSET (cur_fld)) - + (tree_to_uhwi (DECL_FIELD_BIT_OFFSET (cur_fld)) - / BITS_PER_UNIT)); - - if (offset == cur_offset) - { - new_val = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (val), - cur_arg); - break; - } - } - } - - /* If we found the corresponding OMP data record field, replace the - RHS with the new created PARM_DECL. */ - if (new_val != NULL_TREE) - { - if (dump_file) - { - fprintf (dump_file, "For gimple stmt: "); - print_gimple_stmt (dump_file, stmt, 0); - fprintf (dump_file, "\tReplacing OMP recv ref %s with %s\n", - print_generic_expr_to_str (val), - print_generic_expr_to_str (new_val)); - } - /* Write in looked up ARG as new RHS value. */ - *val_ptr = new_val; - } - } - - /* Delete SSA_NAMEs of .omp_data_i by setting them to NULL_TREE. */ - tree name; - FOR_EACH_SSA_NAME (i, name, cfun) - if (SSA_NAME_VAR (name) == omp_data_arg) - (*SSANAMES (cfun))[SSA_NAME_VERSION (name)] = NULL_TREE; - - if (dump_file) - { - fprintf (dump_file, "Function after OMP data arg replaced: "); - dump_function_to_file (current_function_decl, dump_file, dump_flags); - } - } -} - #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override @@ -6715,9 +6489,6 @@ nvptx_expand_to_rtl_hook (void) #undef TARGET_LIBC_HAS_FUNCTION #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function -#undef TARGET_EXPAND_TO_RTL_HOOK -#define TARGET_EXPAND_TO_RTL_HOOK nvptx_expand_to_rtl_hook - struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-nvptx.h" diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index b39fe5dd2a4..7e8e35bb5fa 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,16 @@ +2021-10-12 Julian Brown <jul...@codesourcery.com> + + Revert: + + 2019-09-10 Chung-Lin Tang <clt...@codesourcery.com> + + * plugin/plugin-nvptx.c (nvptx_exec): Adjust arguments, add + kernel argument setup code, adjust cuLaunchKernel calling code. + (GOMP_OFFLOAD_openacc_exec): Adjust nvptx_exec call, delete + profiling code. + (GOMP_OFFLOAD_openacc_async_exec): Likewise. + (cuda_free_argmem): Delete function. + 2021-10-12 Tobias Burnus <tob...@codesourcery.com> Backported from master: diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index a9680962f51..f1b0d8d4fbd 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -731,24 +731,16 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, static void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - unsigned *dims, CUstream stream) + unsigned *dims, void *targ_mem_desc, + CUdeviceptr dp, CUstream stream) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; int i; + void *kargs[1]; struct nvptx_thread *nvthd = nvptx_thread (); int warp_size = nvthd->ptx_dev->warp_size; - void **kernel_args = NULL; - GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) mapnum); - - if (mapnum > 0) - { - kernel_args = alloca (mapnum * sizeof (void *)); - for (int i = 0; i < mapnum; i++) - kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]); - } - function = targ_fn->fn; /* Initialize the launch dimensions. Typically this is constant, @@ -980,10 +972,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, api_info); } + kargs[0] = &dp; CUDA_CALL_ASSERT (cuLaunchKernel, function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, - 0, stream, kernel_args, 0); + 0, stream, kargs, 0); if (profiling_p) { @@ -1463,7 +1456,66 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, void **hostaddrs, void **devaddrs, unsigned *dims, void *targ_mem_desc) { - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL); + GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); + + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + bool profiling_p = __builtin_expect (prof_info != NULL, false); + + void **hp = NULL; + CUdeviceptr dp = 0; + + if (mapnum > 0) + { + size_t s = mapnum * sizeof (void *); + hp = alloca (s); + for (int i = 0; i < mapnum; i++) + hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); + CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); + if (profiling_p) + goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); + } + + /* Copy the (device) pointers to arguments to the device (dp and hp might in + fact have the same value on a unified-memory system). */ + if (mapnum > 0) + { + if (profiling_p) + { + prof_info->event_type = acc_ev_enqueue_upload_start; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; + data_event_info.data_event.implicit = 1; /* Always implicit. */ + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; + data_event_info.data_event.bytes = mapnum * sizeof (void *); + data_event_info.data_event.host_ptr = hp; + data_event_info.data_event.device_ptr = (const void *) dp; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, + mapnum * sizeof (void *)); + if (profiling_p) + { + prof_info->event_type = acc_ev_enqueue_upload_end; + data_event_info.data_event.event_type = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + } + + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, + dp, NULL); CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL); const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -1472,6 +1524,18 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); + + CUDA_CALL_ASSERT (cuMemFree, dp); + if (profiling_p) + goacc_profiling_acc_ev_free (thr, (void *) dp); +} + +static void +cuda_free_argmem (void *ptr) +{ + void **block = (void **) ptr; + nvptx_free (block[0], (struct ptx_device *) block[1]); + free (block); } void @@ -1480,7 +1544,78 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, unsigned *dims, void *targ_mem_desc, struct goacc_asyncqueue *aq) { - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream); + GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); + + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + bool profiling_p = __builtin_expect (prof_info != NULL, false); + + void **hp = NULL; + CUdeviceptr dp = 0; + void **block = NULL; + + if (mapnum > 0) + { + size_t s = mapnum * sizeof (void *); + block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s); + hp = block + 2; + for (int i = 0; i < mapnum; i++) + hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); + CUDA_CALL_ASSERT (cuMemAlloc, &dp, s); + if (profiling_p) + goacc_profiling_acc_ev_alloc (thr, (void *) dp, s); + } + + /* Copy the (device) pointers to arguments to the device (dp and hp might in + fact have the same value on a unified-memory system). */ + if (mapnum > 0) + { + if (profiling_p) + { + prof_info->event_type = acc_ev_enqueue_upload_start; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; + data_event_info.data_event.implicit = 1; /* Always implicit. */ + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; + data_event_info.data_event.bytes = mapnum * sizeof (void *); + data_event_info.data_event.host_ptr = hp; + data_event_info.data_event.device_ptr = (const void *) dp; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + + CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, + mapnum * sizeof (void *), aq->cuda_stream); + block[0] = (void *) dp; + + struct nvptx_thread *nvthd = + (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); + block[1] = (void *) nvthd->ptx_dev; + + if (profiling_p) + { + prof_info->event_type = acc_ev_enqueue_upload_end; + data_event_info.data_event.event_type = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + } + + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, + dp, aq->cuda_stream); + + if (mapnum > 0) + GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block); } void * -- 2.29.2