Couple points:
* bytewise smem write may be slow?
* sync before and after timed operation, otherwise you time who knows what
* or, even better, use events.
HTH,
Andreas
On Samstag 02 Januar 2010, Hampton G. Miller wrote:
> I have noticed something which seems odd and which I hope you will look at
> and then tell me if it is something unique to PyCUDA or else is something
> which should be brought to the attention of Nvidia. (Or, that I am just a
> simpleton!)
>
> Looking at my test results, below, and referring to my attached Python
> program with comments, it seems to me that the amount of time taken by
> pycuda.driver.Context.synchronize() is strongly a function of the count
> and type of sram (shared-memory) accesses. This seems odd to me. Do you agree?
>
> For example, it takes over 13 seconds to sync after doing nothing more than
> writing zeros to (almost) all of the sram bytes for a 512x512 grid!
>
> Regards, Hampton
>
>
> PyCUDA 0.93 running on Mint 7 Linux
>
> Using device GeForce 9800 GT
> gridDim_x gridDim_y blockDim_x blockDim_y blockDim_z
> A B C D E F G H
> 0: 1 1 1 1 1
> 0.001050 0.000120 0.000442 0.000072 0.000257 0.000069 0.000068
> 0.000069
> 1: 1 1 512 1 1
> 0.000828 0.000072 0.000441 0.000073 0.000257 0.000070 0.000069
> 0.000069
> 2: 1 100 512 1 1
> 0.007309 0.000167 0.003026 0.000106 0.001546 0.000072 0.000072
> 0.000072
> 3: 100 1 512 1 1
> 0.005985 0.000077 0.003016 0.000071 0.001543 0.000073 0.000072
> 0.000071
> 4: 100 100 512 1 1
> 0.526857 0.000303 0.263423 0.000302 0.131828 0.000304 0.000311
> 0.000210
> 5: 1 256 512 1 1
> 0.014104 0.000167 0.007073 0.000075 0.003572 0.000076 0.000076
> 0.000073
> 6: 256 1 512 1 1
> 0.014087 0.000081 0.007069 0.000077 0.003570 0.000093 0.000077
> 0.000073
> 7: 256 256 512 1 1
> 3.447902 0.001038 1.724391 0.001039 0.862664 0.001041 0.001586
> 0.000957
> 8: 1 512 512 1 1
> 0.027301 0.000061 0.013667 0.000046 0.006857 0.000045 0.000050
> 0.000044
> 9: 512 1 512 1 1
> 0.027314 0.000125 0.013669 0.000047 0.006855 0.000045 0.000049
> 0.000044
> 10: 512 512 512 1 1
> 13.789054 0.003796 6.896283 0.003800 3.449923 0.003794 0.006229
> 0.003898
> 31.298553 secs total
>
> #!/usr/bin/env python
>
> # nvidia_example.py -
>
> import sys
> import os
> import time
> import numpy
>
> import pycuda.autoinit
> import pycuda.driver as cuda
> from pycuda.compiler import SourceModule
>
>
> gridDim_x = 1
> gridDim_y = 1
>
> blockDim_x = 1
> blockDim_y = 1
> blockDim_z = 1
>
> gridBlockList = [
> (1, 1, 1,1,1), ( 1,1, 512,1,1),
> (1,100, 512,1,1), (100,1, 512,1,1), (100,100, 512,1,1),
> (1,256, 512,1,1), (256,1, 512,1,1), (256,256, 512,1,1),
> (1,512, 512,1,1), (512,1, 512,1,1), (512,512, 512,1,1) ]
>
> # ==================================================================================================
>
> cuda.init()
> device = pycuda.tools.get_default_device()
> print "Using device", device.name()
>
> dev_dataRecords = cuda.mem_alloc( 1024 * 15 )
>
> # ----------------------------------------------------------------------------------------------
>
> krnl = SourceModule("""
> __global__ void worker_0 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case A:
> for( i=0; i<sizeof(dst); ++i ) // Count = sizeof(dst)
> dst[ i ] = 0; // Type = indexed by i
>
> dst[ 0 ] = dst[ 1 ]; // (Gag "set but never
> used" warning message from compiler)
> };
> }
>
> __global__ void worker_1 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case B:
> for( i=0; i<sizeof(dst); ++i ) // Count = sizeof(dst)
> dst[ 0 ] = 0; // Type = always the same
> element, 0
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_2 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case C:
> for( i=0; i<(sizeof(dst)/2); ++i ) // Count = sizeof(dst)/2
> dst[ i ] = 0; // Type = indexed by i
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_3 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case D:
> for( i=0; i<(sizeof(dst)/2); ++i ) // Count = sizeof(dst)/2
> dst[ 0 ] = 0; // Type = always the same
> element, 0
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_4 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case E:
> for( i=0; i<(sizeof(dst)/4); ++i ) // Count = sizeof(dst)/4
> dst[ i ] = 0; // Type = indexed by i
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_5 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
> int i;
>
> if ( threadIdx.x == 0 )
> { // Case F:
> for( i=0; i<(sizeof(dst)/4); ++i ) // Count = sizeof(dst)/4
> dst[ 0 ] = 0; // Type = always the same
> element, 0
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_6 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
>
> if ( threadIdx.x == 0 )
> { // Case G:
> dst[ 0 ] = 0; // Count = 10
> dst[ 1 ] = 1; // Type = different
> elements
> dst[ 2 ] = 2;
> dst[ 3 ] = 3;
> dst[ 4 ] = 4;
> dst[ 5 ] = 5;
> dst[ 6 ] = 6;
> dst[ 7 ] = 7;
> dst[ 8 ] = 8;
> dst[ 9 ] = 9;
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
>
> __global__ void worker_7 ( char * src )
> {
> __shared__ char dst[ (1024 * 15) ];
>
> if ( threadIdx.x == 0 )
> { // Case H:
> dst[ 0 ] = 0; // Count = 10
> dst[ 0 ] = 1; // Type = always same
> element, 0
> dst[ 0 ] = 2;
> dst[ 0 ] = 3;
> dst[ 0 ] = 4;
> dst[ 0 ] = 5;
> dst[ 0 ] = 6;
> dst[ 0 ] = 7;
> dst[ 0 ] = 8;
> dst[ 0 ] = 9;
>
> dst[ 0 ] = dst[ 1 ];
> };
> }
> """)
>
> worker_0 = krnl.get_function("worker_0")
> worker_1 = krnl.get_function("worker_1")
> worker_2 = krnl.get_function("worker_2")
> worker_3 = krnl.get_function("worker_3")
> worker_4 = krnl.get_function("worker_4")
> worker_5 = krnl.get_function("worker_5")
> worker_6 = krnl.get_function("worker_6")
> worker_7 = krnl.get_function("worker_7")
>
> # ==================================================================================================
>
> print " gridDim_x gridDim_y blockDim_x blockDim_y blockDim_z",
> print "%9s" % "A",
> print "%9s" % "B",
> print "%9s" % "C",
> print "%9s" % "D",
> print "%9s" % "E",
> print "%9s" % "F",
> print "%9s" % "G",
> print "%9s" % "H"
>
> timewas0 = time.time()
> n = 0
> for (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z) in gridBlockList:
>     print "%2u:" % n, ("%12u" * 5) % (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z),
>     sys.stdout.flush()
>     for worker in [ worker_0, worker_1, worker_2, worker_3, worker_4, worker_5, worker_6, worker_7 ]:
>         timewas1 = time.time()
>         worker( dev_dataRecords, grid=(gridDim_x, gridDim_y), block=(blockDim_x,blockDim_y,blockDim_z) )
>         pycuda.driver.Context.synchronize()
>         print "%9.6f" % (time.time() - timewas1),
>     print
>     n += 1
> print "%1.6f secs total" % (time.time() - timewas0)
>
signature.asc
Description: This is a digitally signed message part.
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net
