A couple of points:

* bytewise smem write may be slow?
* sync before and after timed operation, otherwise you time who knows what
* or, even better, use events.

HTH,
Andreas


On Samstag 02 Januar 2010, Hampton G. Miller wrote:
> I have noticed something which seems odd and which I hope you will look at
> and then tell me if it is something unique to PyCUDA or else is something
> which should be brought to the attention of Nvidia.  (Or, that I am just a
> simpleton!)
> 
> Looking at my test results, below, and referring to my attached Python
> program with comments, it seems to me that the amount of time taken by
> pycuda.driver.Context.synchronize() is strongly a function of the count
> and type of sram accesses.  This seems odd to me.  Do you agree?
> 
> For example, it takes over 13 seconds to sync after doing nothing more than
> writing zeros to (almost) all of the sram bytes for a 512x512 grid!
> 
> Regards, Hampton
> 
> 
> PyCUDA 0.93 running on Mint 7 Linux
> 
>     Using device GeForce 9800 GT
>            gridDim_x   gridDim_y  blockDim_x  blockDim_y  blockDim_z
> A         B         C         D         E         F         G         H
>      0:            1           1           1           1           1
> 0.001050  0.000120  0.000442  0.000072  0.000257  0.000069  0.000068
> 0.000069
>      1:            1           1         512           1           1
> 0.000828  0.000072  0.000441  0.000073  0.000257  0.000070  0.000069
> 0.000069
>      2:            1         100         512           1           1
> 0.007309  0.000167  0.003026  0.000106  0.001546  0.000072  0.000072
> 0.000072
>      3:          100           1         512           1           1
> 0.005985  0.000077  0.003016  0.000071  0.001543  0.000073  0.000072
> 0.000071
>      4:          100         100         512           1           1
> 0.526857  0.000303  0.263423  0.000302  0.131828  0.000304  0.000311
> 0.000210
>      5:            1         256         512           1           1
> 0.014104  0.000167  0.007073  0.000075  0.003572  0.000076  0.000076
> 0.000073
>      6:          256           1         512           1           1
> 0.014087  0.000081  0.007069  0.000077  0.003570  0.000093  0.000077
> 0.000073
>      7:          256         256         512           1           1
> 3.447902  0.001038  1.724391  0.001039  0.862664  0.001041  0.001586
> 0.000957
>      8:            1         512         512           1           1
> 0.027301  0.000061  0.013667  0.000046  0.006857  0.000045  0.000050
> 0.000044
>      9:          512           1         512           1           1
> 0.027314  0.000125  0.013669  0.000047  0.006855  0.000045  0.000049
> 0.000044
>     10:          512         512         512           1           1
> 13.789054  0.003796  6.896283  0.003800  3.449923  0.003794  0.006229
> 0.003898
>     31.298553 secs total
> 
> #!/usr/bin/env python
> 
> # nvidia_example.py -
> 
> import sys
> import os
> import time
> import numpy
> 
> import pycuda.autoinit
> import pycuda.driver as cuda
> from   pycuda.compiler import SourceModule
> 
> 
> gridDim_x  = 1
> gridDim_y  = 1
> 
> blockDim_x = 1
> blockDim_y = 1
> blockDim_z = 1
> 
> gridBlockList = [
>     (1,  1,   1,1,1), (  1,1, 512,1,1),
>     (1,100, 512,1,1), (100,1, 512,1,1), (100,100, 512,1,1),
>     (1,256, 512,1,1), (256,1, 512,1,1), (256,256, 512,1,1),
>     (1,512, 512,1,1), (512,1, 512,1,1), (512,512, 512,1,1) ]
> 
> #
> ===========================================================================
> =======================
> 
> cuda.init()
> device = pycuda.tools.get_default_device()
> print "Using device", device.name()
> 
> dev_dataRecords = cuda.mem_alloc( 1024 * 15 )
> 
> #
> ---------------------------------------------------------------------------
> -------------------
> 
> krnl = SourceModule("""
> __global__ void worker_0 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case A:
>         for( i=0; i<sizeof(dst); ++i )            // Count = sizeof(dst)
>             dst[ i ] = 0;                        // Type = indexed by i
> 
>         dst[ 0 ] = dst[ 1 ];                    // (Gag "set but never
>  used" warning message from compiler)
>     };
> }
> 
> __global__ void worker_1 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case B:
>         for( i=0; i<sizeof(dst); ++i )            // Count = sizeof(dst)
>             dst[ 0 ] = 0;                        // Type = always the same
> element, 0
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_2 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case C:
>         for( i=0; i<(sizeof(dst)/2); ++i )        // Count = sizeof(dst)/2
>             dst[ i ] = 0;                        // Type = indexed by i
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_3 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case D:
>         for( i=0; i<(sizeof(dst)/2); ++i )        // Count = sizeof(dst)/2
>             dst[ 0 ] = 0;                        // Type = always the same
> element, 0
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_4 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case E:
>         for( i=0; i<(sizeof(dst)/4); ++i )        // Count = sizeof(dst)/4
>             dst[ i ] = 0;                        // Type = indexed by i
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_5 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
>     int i;
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case F:
>         for( i=0; i<(sizeof(dst)/4); ++i )        // Count = sizeof(dst)/4
>             dst[ 0 ] = 0;                        // Type = always the same
> element, 0
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_6 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case G:
>         dst[ 0 ] = 0;                            // Count = 10
>         dst[ 1 ] = 1;                            // Type = different
> elements
>         dst[ 2 ] = 2;
>         dst[ 3 ] = 3;
>         dst[ 4 ] = 4;
>         dst[ 5 ] = 5;
>         dst[ 6 ] = 6;
>         dst[ 7 ] = 7;
>         dst[ 8 ] = 8;
>         dst[ 9 ] = 9;
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> 
> __global__ void worker_7 ( char * src )
> {
>     __shared__ char dst[ (1024 * 15) ];
> 
>     if ( threadIdx.x == 0 )
>     {                                            // Case H:
>         dst[ 0 ] = 0;                            // Count = 10
>         dst[ 0 ] = 1;                            // Type = always same
> element, 0
>         dst[ 0 ] = 2;
>         dst[ 0 ] = 3;
>         dst[ 0 ] = 4;
>         dst[ 0 ] = 5;
>         dst[ 0 ] = 6;
>         dst[ 0 ] = 7;
>         dst[ 0 ] = 8;
>         dst[ 0 ] = 9;
> 
>         dst[ 0 ] = dst[ 1 ];
>     };
> }
> """)
> 
> worker_0 = krnl.get_function("worker_0")
> worker_1 = krnl.get_function("worker_1")
> worker_2 = krnl.get_function("worker_2")
> worker_3 = krnl.get_function("worker_3")
> worker_4 = krnl.get_function("worker_4")
> worker_5 = krnl.get_function("worker_5")
> worker_6 = krnl.get_function("worker_6")
> worker_7 = krnl.get_function("worker_7")
> 
> #
> ===========================================================================
> =======================
> 
> print "       gridDim_x   gridDim_y  blockDim_x  blockDim_y  blockDim_z",
> print "%9s" % "A",
> print "%9s" % "B",
> print "%9s" % "C",
> print "%9s" % "D",
> print "%9s" % "E",
> print "%9s" % "F",
> print "%9s" % "G",
> print "%9s" % "H"
> 
> timewas0 = time.time()
> n = 0
> for (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z) in
> gridBlockList:
>     print "%2u:" % n, ("%12u" * 5) % (gridDim_x, gridDim_y, blockDim_x,
> blockDim_y, blockDim_z),
>     sys.stdout.flush()
>     for worker in [ worker_0, worker_1, worker_2, worker_3, worker_4,
> worker_5, worker_6, worker_7 ]:
>         timewas1 = time.time()
>         worker( dev_dataRecords, grid=(gridDim_x, gridDim_y),
> block=(blockDim_x,blockDim_y,blockDim_z) )
>         pycuda.driver.Context.synchronize()
>         print "%9.6f" % (time.time() - timewas1),
>     print
>     n += 1
> print "%1.6f secs total" % (time.time() - timewas0)
> 

Attachment: signature.asc
Description: This is a digitally signed message part.

_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net

Reply via email to