This patch interrogates the target device to determine the default geometry at
runtime. This makes the greatest difference to gang partitioning, where there's a
noticeable sawtooth in the relationship between number of gangs and execution
time. Picking the number of gangs as an exact multiple of the number of physical
multiprocessors gets the best performance. Picking one more than that gives a step
increase in execution time. The sawtooth gets blunter as the multiplication
factor increases, as one might expect when scheduling smaller and smaller
parcels of work onto a limited set of physical cpus.
nathan
2016-08-30 Nathan Sidwell <nat...@codesourcery.com>
* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
to determine default geometry.
Index: plugin/plugin-nvptx.c
===================================================================
--- plugin/plugin-nvptx.c (revision 239862)
+++ plugin/plugin-nvptx.c (working copy)
@@ -938,14 +938,42 @@ nvptx_exec (void (*fn), size_t mapnum, v
}
}
- /* Do some sanity checking. The CUDA API doesn't appear to
- provide queries to determine these limits. */
+ int warp_size, block_size, dev_size, cpu_size;
+ CUdevice dev = nvptx_thread()->ptx_dev->dev;
+ /* 32 is the default for known hardware. */
+ int gang = 0, worker = 32, vector = 32;
+
+ if (CUDA_SUCCESS == cuDeviceGetAttribute
+ (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)
+ && CUDA_SUCCESS == cuDeviceGetAttribute
+ (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
+ && CUDA_SUCCESS == cuDeviceGetAttribute
+ (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)
+ && CUDA_SUCCESS == cuDeviceGetAttribute
+ (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))
+ {
+ GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+ " dev_size=%d, cpu_size=%d\n",
+ warp_size, block_size, dev_size, cpu_size);
+ gang = (cpu_size / block_size) * dev_size;
+ worker = block_size / warp_size;
+ vector = warp_size;
+ }
+
+ /* There is no upper bound on the gang size. The best size
+ matches the hardware configuration. Logical gangs are
+ scheduled onto physical hardware. To maximize usage, we
+ should guess a large number. */
if (default_dims[GOMP_DIM_GANG] < 1)
- default_dims[GOMP_DIM_GANG] = 32;
+ default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
+ /* The worker size must not exceed the hardware. */
if (default_dims[GOMP_DIM_WORKER] < 1
- || default_dims[GOMP_DIM_WORKER] > 32)
- default_dims[GOMP_DIM_WORKER] = 32;
- default_dims[GOMP_DIM_VECTOR] = 32;
+ || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+ default_dims[GOMP_DIM_WORKER] = worker;
+ /* The vector size must exactly match the hardware. */
+ if (default_dims[GOMP_DIM_VECTOR] < 1
+ || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+ default_dims[GOMP_DIM_VECTOR] = vector;
GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
default_dims[GOMP_DIM_GANG],