[gomp4] runtime default compute dimensions

Nathan Sidwell Tue, 30 Aug 2016 07:54:52 -0700

This patch interrogates the target device to determine the default geometry at runtime. This has the greatest effect on gang partitioning, where there's a noticeable sawtooth in the relationship between the number of gangs and execution time. Picking the number of gangs as an exact multiple of the number of physical multi-cpus gets the best performance. Picking one more than that gives a step increase in execution time. The sawtooth gets blunter as the multiplication factor increases, as one might expect when scheduling smaller and smaller parcels of work onto a limited set of physical cpus.


nathan

2016-08-30  Nathan Sidwell  <nat...@codesourcery.com>


	* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
	to determine default geometry.

Index: plugin/plugin-nvptx.c
===================================================================
--- plugin/plugin-nvptx.c	(revision 239862)
+++ plugin/plugin-nvptx.c	(working copy)
@@ -938,14 +938,42 @@ nvptx_exec (void (*fn), size_t mapnum, v
 		}
 	    }
 
-	  /* Do some sanity checking.  The CUDA API doesn't appear to
-	     provide queries to determine these limits.  */
+	  int warp_size, block_size, dev_size, cpu_size;
+	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
+	  /* 32 is the default for known hardware.  */
+	  int gang = 0, worker = 32, vector = 32;
+
+	  if (CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))
+	    {
+	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+				 " dev_size=%d, cpu_size=%d\n",
+				 warp_size, block_size, dev_size, cpu_size);
+	      gang = (cpu_size / block_size) * dev_size;
+	      worker = block_size / warp_size;
+	      vector = warp_size;
+	    }
+
+	  /* There is no upper bound on the gang size.  The best size
+	     matches the hardware configuration.  Logical gangs are
+	     scheduled onto physical hardware.  To maximize usage, we
+	     should guess a large number.  */
 	  if (default_dims[GOMP_DIM_GANG] < 1)
-	    default_dims[GOMP_DIM_GANG] = 32;
+	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
+	  /* The worker size must not exceed the hardware.  */
 	  if (default_dims[GOMP_DIM_WORKER] < 1
-	      || default_dims[GOMP_DIM_WORKER] > 32)
-	    default_dims[GOMP_DIM_WORKER] = 32;
-	  default_dims[GOMP_DIM_VECTOR] = 32;
+	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	    default_dims[GOMP_DIM_WORKER] = worker;
+	  /* The vector size must exactly match the hardware.  */
+	  if (default_dims[GOMP_DIM_VECTOR] < 1
+	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	    default_dims[GOMP_DIM_VECTOR] = vector;
 
 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
 			     default_dims[GOMP_DIM_GANG],

[gomp4] runtime default compute dimensions

Reply via email to