I have noticed something which seems odd and which I hope you will look at
and then tell me if it is something unique to PyCUDA or else is something
which should be brought to the attention of Nvidia. (Or, that I am just a
simpleton!)
Looking at my test results, below, and referring to my attached Python
program with comments, it seems to me that the amount of time taken by
pycuda.driver.Context.synchronize() is strongly a function of the count
and type of shared-memory ("sram") accesses. This seems odd to me. Do
you agree?
For example, it takes over 13 seconds to sync after doing nothing more than
writing zeros to (almost) all of the sram bytes for a 512x512 grid!
Regards, Hampton
PyCUDA 0.93 running on Mint 7 Linux
Using device GeForce 9800 GT
gridDim_x gridDim_y blockDim_x blockDim_y blockDim_z
A B C D E F G H
0: 1 1 1 1 1
0.001050 0.000120 0.000442 0.000072 0.000257 0.000069 0.000068
0.000069
1: 1 1 512 1 1
0.000828 0.000072 0.000441 0.000073 0.000257 0.000070 0.000069
0.000069
2: 1 100 512 1 1
0.007309 0.000167 0.003026 0.000106 0.001546 0.000072 0.000072
0.000072
3: 100 1 512 1 1
0.005985 0.000077 0.003016 0.000071 0.001543 0.000073 0.000072
0.000071
4: 100 100 512 1 1
0.526857 0.000303 0.263423 0.000302 0.131828 0.000304 0.000311
0.000210
5: 1 256 512 1 1
0.014104 0.000167 0.007073 0.000075 0.003572 0.000076 0.000076
0.000073
6: 256 1 512 1 1
0.014087 0.000081 0.007069 0.000077 0.003570 0.000093 0.000077
0.000073
7: 256 256 512 1 1
3.447902 0.001038 1.724391 0.001039 0.862664 0.001041 0.001586
0.000957
8: 1 512 512 1 1
0.027301 0.000061 0.013667 0.000046 0.006857 0.000045 0.000050
0.000044
9: 512 1 512 1 1
0.027314 0.000125 0.013669 0.000047 0.006855 0.000045 0.000049
0.000044
10: 512 512 512 1 1
13.789054 0.003796 6.896283 0.003800 3.449923 0.003794 0.006229
0.003898
31.298553 secs total
#!/usr/bin/env python
# nvidia_example.py -
import sys
import os
import time

import numpy

import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.tools
from pycuda.compiler import SourceModule
# Launch configurations to sweep, one tuple per run:
#   (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z)
# The per-run values are unpacked from this list in the timing loop
# below, so no separate scalar initialization is needed.
gridBlockList = [
    (  1,   1,   1, 1, 1), (  1,   1, 512, 1, 1),
    (  1, 100, 512, 1, 1), (100,   1, 512, 1, 1), (100, 100, 512, 1, 1),
    (  1, 256, 512, 1, 1), (256,   1, 512, 1, 1), (256, 256, 512, 1, 1),
    (  1, 512, 512, 1, 1), (512,   1, 512, 1, 1), (512, 512, 512, 1, 1),
]
#
==================================================================================================
cuda.init()
device = pycuda.tools.get_default_device()
print "Using device", device.name()
dev_dataRecords = cuda.mem_alloc( 1024 * 15 )
#
----------------------------------------------------------------------------------------------
# Eight kernel variants (cases A-H).  Each writes to a 15 KB __shared__
# array from thread 0 only; the variants differ in how many stores are
# issued and whether the store target is indexed or a single fixed
# element.  The final "dst[0] = dst[1]" read exists only to keep the
# compiler from warning that dst is set but never used.
# NOTE: the original posting had several // comments wrapped onto
# uncommented lines by the mailer, which breaks nvcc; repaired here.
krnl = SourceModule("""
__global__ void worker_0 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case A:
        for( i=0; i<sizeof(dst); ++i )       // Count = sizeof(dst)
            dst[ i ] = 0;                    // Type = indexed by i
        dst[ 0 ] = dst[ 1 ];  // (Gag "set but never used" warning message from compiler)
    };
}
__global__ void worker_1 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case B:
        for( i=0; i<sizeof(dst); ++i )       // Count = sizeof(dst)
            dst[ 0 ] = 0;                    // Type = always the same element, 0
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_2 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case C:
        for( i=0; i<(sizeof(dst)/2); ++i )   // Count = sizeof(dst)/2
            dst[ i ] = 0;                    // Type = indexed by i
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_3 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case D:
        for( i=0; i<(sizeof(dst)/2); ++i )   // Count = sizeof(dst)/2
            dst[ 0 ] = 0;                    // Type = always the same element, 0
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_4 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case E:
        for( i=0; i<(sizeof(dst)/4); ++i )   // Count = sizeof(dst)/4
            dst[ i ] = 0;                    // Type = indexed by i
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_5 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;
    if ( threadIdx.x == 0 )
    {   // Case F:
        for( i=0; i<(sizeof(dst)/4); ++i )   // Count = sizeof(dst)/4
            dst[ 0 ] = 0;                    // Type = always the same element, 0
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_6 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    if ( threadIdx.x == 0 )
    {   // Case G:
        dst[ 0 ] = 0;                        // Count = 10
        dst[ 1 ] = 1;                        // Type = different elements
        dst[ 2 ] = 2;
        dst[ 3 ] = 3;
        dst[ 4 ] = 4;
        dst[ 5 ] = 5;
        dst[ 6 ] = 6;
        dst[ 7 ] = 7;
        dst[ 8 ] = 8;
        dst[ 9 ] = 9;
        dst[ 0 ] = dst[ 1 ];
    };
}
__global__ void worker_7 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    if ( threadIdx.x == 0 )
    {   // Case H:
        dst[ 0 ] = 0;                        // Count = 10
        dst[ 0 ] = 1;                        // Type = always same element, 0
        dst[ 0 ] = 2;
        dst[ 0 ] = 3;
        dst[ 0 ] = 4;
        dst[ 0 ] = 5;
        dst[ 0 ] = 6;
        dst[ 0 ] = 7;
        dst[ 0 ] = 8;
        dst[ 0 ] = 9;
        dst[ 0 ] = dst[ 1 ];
    };
}
""")
# Resolve each compiled kernel handle once, up front, so the timing
# loop pays no per-iteration name-lookup cost.  Binds the same eight
# module-level names (worker_0 .. worker_7) as before.
(worker_0, worker_1, worker_2, worker_3,
 worker_4, worker_5, worker_6, worker_7) = [
    krnl.get_function("worker_%d" % idx) for idx in range(8)]
#
==================================================================================================
print " gridDim_x gridDim_y blockDim_x blockDim_y blockDim_z",
print "%9s" % "A",
print "%9s" % "B",
print "%9s" % "C",
print "%9s" % "D",
print "%9s" % "E",
print "%9s" % "F",
print "%9s" % "G",
print "%9s" % "H"
timewas0 = time.time()
n = 0
for (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z) in
gridBlockList:
print "%2u:" % n, ("%12u" * 5) % (gridDim_x, gridDim_y, blockDim_x,
blockDim_y, blockDim_z),
sys.stdout.flush()
for worker in [ worker_0, worker_1, worker_2, worker_3, worker_4,
worker_5, worker_6, worker_7 ]:
timewas1 = time.time()
worker( dev_dataRecords, grid=(gridDim_x, gridDim_y),
block=(blockDim_x,blockDim_y,blockDim_z) )
pycuda.driver.Context.synchronize()
print "%9.6f" % (time.time() - timewas1),
print
n += 1
print "%1.6f secs total" % (time.time() - timewas0)
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net