On 07/02/2018 07:14 AM, Tom de Vries wrote:
> On 06/21/2018 03:58 PM, Cesar Philippidis wrote:
>> On 06/20/2018 03:15 PM, Tom de Vries wrote:
>>> On 06/20/2018 11:59 PM, Cesar Philippidis wrote:
>>>> Now it follows the formula contained in
>>>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA.
>>>
>>> Any reason we're not using the cuda runtime functions to get the
>>> occupancy (see PR85590 - [nvptx, libgomp, openacc] Use cuda runtime fns
>>> to determine launch configuration in nvptx ) ?
>>
>> There are two reasons:
>>
>>   1) cuda_occupancy.h depends on the CUDA runtime to extract the device
>>      properties instead of the CUDA driver API. However, we can always
>>      teach libgomp how to populate the cudaDeviceProp struct using the
>>      driver API.
>>
>>   2) CUDA is not always present on the build host, and that's why
>>      libgomp maintains its own cuda.h. So at the very least, this
>>      functionality would be good to have in libgomp as a fallback
>>      implementation;
> 
> Libgomp maintains its own cuda.h to "allow building GCC with PTX
> offloading even without CUDA being installed" (
> https://gcc.gnu.org/ml/gcc-patches/2017-01/msg00980.html ).
> 
> The libgomp nvptx plugin however uses the cuda driver API to launch
> kernels etc, so we can assume that's always available at launch time.
> And according to the "CUDA Pro Tip: Occupancy API Simplifies Launch
> Configuration", the occupancy API is also available in the driver API.
> 
> What we cannot assume to be available is the occupancy API pre cuda-6.5.
> So it's fine to have a fallback for that (properly isolated in utility
> functions), but for cuda 6.5 and up we want to use the occupancy API.

Here's revision 2 of the patch. I replaced all of my thread occupancy
heuristics with calls to the CUDA driver, as you suggested. The
performance is worse than with my heuristics, but that's to be expected,
because the CUDA driver only guarantees the minimal launch geometry
needed to fully utilize the hardware, not the optimal one. I'll
reintroduce my heuristics later as a follow-up patch. The major
advantage of the CUDA thread occupancy calculator is that it allows the
runtime to select a sensible default num_workers, avoiding those annoying
runtime failures due to insufficient GPU hardware resources.
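For reference, and not as part of the patch itself, here's a minimal
standalone sketch of how the driver's occupancy API can be turned into an
OpenACC launch geometry. The function name and the per-SM scaling are my
own illustration and simply mirror what nvptx_exec does below; error
checking is omitted:

#include <cuda.h>

/* Sketch only: derive default num_gangs/num_workers for a loaded kernel
   FUNCTION from the driver's occupancy calculator.  NUM_SMS is the value
   of CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT for the device and
   VECTOR_LENGTH is the warp size.  */
static void
default_dims_from_occupancy (CUfunction function, int num_sms,
			     int vector_length, int *num_gangs,
			     int *num_workers)
{
  int min_grid_size = 0, block_size = 0;

  /* Ask the driver for the smallest grid and the block size that achieve
     full occupancy for this kernel; no dynamic shared memory and no
     block size limit.  */
  cuOccupancyMaxPotentialBlockSize (&min_grid_size, &block_size,
				    function, NULL, 0, 0);

  /* Scale the suggested grid size by the number of SMs, as the patch
     does, and split the block into warp-sized workers.  */
  *num_gangs = min_grid_size * num_sms;
  *num_workers = block_size / vector_length;
}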

One thing that may stick out in this patch is how it probes for the
driver version instead of the API version. It turns out that the API
version corresponds to the SM version declared in the PTX sources,
whereas the driver version corresponds to the latest version of CUDA
supported by the driver. At least that's the case with driver version
396.24.
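To make the version probing concrete, here's a tiny standalone test (not
from the patch; the only assumption beyond the documented cuDriverGetVersion
behavior is the 1000 * major + 10 * minor encoding, so CUDA 6.5 reports
6050):

#include <stdio.h>
#include <cuda.h>

int
main (void)
{
  int driver_version = 0;

  /* cuDriverGetVersion reports the newest CUDA version the installed
     driver supports, independent of the SM target the PTX was compiled
     for; e.g. a driver shipped with CUDA 9.2 reports 9020.  */
  cuDriverGetVersion (&driver_version);

  if (driver_version >= 6050)
    printf ("occupancy API available (driver supports CUDA %d.%d)\n",
	    driver_version / 1000, (driver_version % 1000) / 10);
  else
    printf ("pre-6.5 driver; fall back to manual heuristics\n");
  return 0;
}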

>>      it's not good to have a program fail due to
>>      insufficient hardware resource errors when that is avoidable.
>>
> 
> Right, in fact there are two separate things you're trying to address
> here: launch failure and occupancy heuristic, so split the patch.

That hunk was small, so I included it with this patch, although I can
split it out if you insist.

Is this patch OK for trunk? I tested it on x86_64 with nvptx offloading.

Cesar
2018-07-XX  Cesar Philippidis  <ce...@codesourcery.com>
	    Tom de Vries  <tdevr...@suse.de>

	gcc/
	* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Rename to ...
	(PTX_DEFAULT_RUNTIME_DIM): ... this.
	(nvptx_goacc_validate_dims): Set default worker and gang dims to
	PTX_DEFAULT_RUNTIME_DIM.
	(nvptx_dim_limit): Ignore GOMP_DIM_WORKER.

	libgomp/
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): Declare.
	(cuOccupancyMaxPotentialBlockSize): Likewise.
	* plugin/plugin-nvptx.c (struct ptx_device): Add driver_version member.
	(nvptx_open_device): Set it.
	(nvptx_exec): Use the CUDA driver to determine the default num_gangs
	and num_workers, and error out if the hardware doesn't have
	sufficient resources to launch the kernel.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5608bee8a8d..c1946e75f42 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5165,7 +5165,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
 
 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
 
@@ -5214,9 +5214,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
     {
       dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
       if (dims[GOMP_DIM_WORKER] < 0)
-	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
       if (dims[GOMP_DIM_GANG] < 0)
-	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
+	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
       changed = true;
     }
 
@@ -5230,9 +5230,6 @@ nvptx_dim_limit (int axis)
 {
   switch (axis)
     {
-    case GOMP_DIM_WORKER:
-      return PTX_WORKER_LENGTH;
-
     case GOMP_DIM_VECTOR:
       return PTX_VECTOR_LENGTH;
 
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..1ee59db172c 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -170,6 +171,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction,
+					    CUoccupancyB2DSize, size_t,
+					    int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 89326e57741..5022e462a3d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -94,6 +94,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)	\
 CUDA_ONE_CALL (cuModuleLoad)		\
 CUDA_ONE_CALL (cuModuleLoadData)	\
 CUDA_ONE_CALL (cuModuleUnload)		\
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
 CUDA_ONE_CALL (cuStreamCreate)		\
 CUDA_ONE_CALL (cuStreamDestroy)		\
 CUDA_ONE_CALL (cuStreamQuery)		\
@@ -414,6 +415,7 @@ struct ptx_device
   int num_sms;
   int regs_per_block;
   int regs_per_sm;
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -725,6 +727,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -806,6 +809,9 @@ nvptx_open_device (int n)
   if (r != CUDA_SUCCESS)
     async_engines = 1;
 
+  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
+  ptx_dev->driver_version = pi;
+
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
@@ -1120,6 +1126,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *hp, *dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
+  int dev_size = nvthd->ptx_dev->num_sms;
 
   function = targ_fn->fn;
 
@@ -1140,8 +1147,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
   if (seen_zero)
     {
-      /* See if the user provided GOMP_OPENACC_DIM environment
-	 variable to specify runtime defaults. */
+      /* Specify runtime defaults.  */
       static int default_dims[GOMP_DIM_MAX];
 
       pthread_mutex_lock (&ptx_dev_lock);
@@ -1150,23 +1156,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
 	    default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
 
-	  int warp_size, block_size, dev_size, cpu_size;
+	  int warp_size, block_size, cpu_size;
 	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
 	  /* 32 is the default for known hardware.  */
 	  int gang = 0, worker = 32, vector = 32;
-	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
+	  CUdevice_attribute cu_tpb, cu_ws, cu_tpm;
 
 	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
 	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
 	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
 
 	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
 				 dev) == CUDA_SUCCESS
 	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
 				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				    dev) == CUDA_SUCCESS
 	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
 				    dev) == CUDA_SUCCESS)
 	    {
@@ -1199,12 +1202,59 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 			     default_dims[GOMP_DIM_VECTOR]);
 	}
       pthread_mutex_unlock (&ptx_dev_lock);
+      int vectors = default_dims[GOMP_DIM_VECTOR];
+      int workers = default_dims[GOMP_DIM_WORKER];
+      int gangs = default_dims[GOMP_DIM_GANG];
+
+      if (nvptx_thread()->ptx_dev->driver_version >= 6050)
+	{
+	  int grids, blocks;
+	  CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			    &blocks, function, NULL, 0,
+			    dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	  GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			     "grid = %d, block = %d\n", grids, blocks);
+
+	  gangs = grids * dev_size;
+	  workers = blocks / vectors;
+	}
 
       for (i = 0; i != GOMP_DIM_MAX; i++)
 	if (!dims[i])
-	  dims[i] = default_dims[i];
+	  {
+	    switch (i)
+	      {
+	      case GOMP_DIM_GANG:
+		/* Use the gang count determined above: the grid size
+		   suggested by the CUDA occupancy calculator scaled by
+		   the number of SMs, or the runtime default when the
+		   occupancy API is not available.  */
+		dims[i] = gangs;
+		break;
+	      case GOMP_DIM_WORKER:
+		dims[i] = workers;
+		break;
+	      case GOMP_DIM_VECTOR:
+		dims[i] = vectors;
+		break;
+	      default:
+		abort ();
+	      }
+	  }
     }
 
+  /* Check if the accelerator has sufficient hardware resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
+      > targ_fn->max_threads_per_block)
+    GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
+		       " launch '%s' with num_workers = %d and vector_length ="
+		       " %d; recompile the program with 'num_workers = x and"
+		       " vector_length = y' on that offloaded region or "
+		       "'-fopenacc-dim=-:x:y' where x * y <= %d.\n",
+		       targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+		       dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
+
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
