I have noticed something which seems odd and which I hope you will look at
and then tell me if it is something unique to PyCUDA or else is something
which should be brought to the attention of Nvidia.  (Or, that I am just a
simpleton!)

Looking at my test results, below, and referring to my attached Python
program with comments, it seems to me that the amount of time taken by
pycuda.driver.Context.synchronize() is strongly a function of the count
and type of shared-memory (sram) accesses performed by the kernel.  This
seems odd to me.  Do you agree?

For example, it takes over 13 seconds to sync after doing nothing more than
writing zeros to (almost) all of the sram bytes for a 512x512 grid!

Regards, Hampton


PyCUDA 0.93 running on Mint 7 Linux

    Using device GeForce 9800 GT
           gridDim_x   gridDim_y  blockDim_x  blockDim_y  blockDim_z
A         B         C         D         E         F         G         H
     0:            1           1           1           1           1
0.001050  0.000120  0.000442  0.000072  0.000257  0.000069  0.000068
0.000069
     1:            1           1         512           1           1
0.000828  0.000072  0.000441  0.000073  0.000257  0.000070  0.000069
0.000069
     2:            1         100         512           1           1
0.007309  0.000167  0.003026  0.000106  0.001546  0.000072  0.000072
0.000072
     3:          100           1         512           1           1
0.005985  0.000077  0.003016  0.000071  0.001543  0.000073  0.000072
0.000071
     4:          100         100         512           1           1
0.526857  0.000303  0.263423  0.000302  0.131828  0.000304  0.000311
0.000210
     5:            1         256         512           1           1
0.014104  0.000167  0.007073  0.000075  0.003572  0.000076  0.000076
0.000073
     6:          256           1         512           1           1
0.014087  0.000081  0.007069  0.000077  0.003570  0.000093  0.000077
0.000073
     7:          256         256         512           1           1
3.447902  0.001038  1.724391  0.001039  0.862664  0.001041  0.001586
0.000957
     8:            1         512         512           1           1
0.027301  0.000061  0.013667  0.000046  0.006857  0.000045  0.000050
0.000044
     9:          512           1         512           1           1
0.027314  0.000125  0.013669  0.000047  0.006855  0.000045  0.000049
0.000044
    10:          512         512         512           1           1
13.789054  0.003796  6.896283  0.003800  3.449923  0.003794  0.006229
0.003898
    31.298553 secs total

#!/usr/bin/env python

# nvidia_example.py -

import sys
import os
import time
import numpy

import pycuda.autoinit
import pycuda.driver as cuda
from   pycuda.compiler import SourceModule


gridDim_x  = 1
gridDim_y  = 1

blockDim_x = 1
blockDim_y = 1
blockDim_z = 1

gridBlockList = [
    (1,  1,   1,1,1), (  1,1, 512,1,1),
    (1,100, 512,1,1), (100,1, 512,1,1), (100,100, 512,1,1),
    (1,256, 512,1,1), (256,1, 512,1,1), (256,256, 512,1,1),
    (1,512, 512,1,1), (512,1, 512,1,1), (512,512, 512,1,1) ]

#
==================================================================================================

cuda.init()
device = pycuda.tools.get_default_device()
print "Using device", device.name()

dev_dataRecords = cuda.mem_alloc( 1024 * 15 )

#
----------------------------------------------------------------------------------------------

# Eight structurally identical kernels: thread 0 of each block performs some
# number of byte stores into a 15 KB __shared__ array.  They differ only in
# the store count and in whether the stores are indexed (dst[i]) or always
# hit the same element (dst[0]).  The trailing "dst[0] = dst[1]" read keeps
# the compiler from discarding dst as dead ("set but never used").
krnl = SourceModule("""
__global__ void worker_0 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case A:
        for( i=0; i<sizeof(dst); ++i )           // Count = sizeof(dst)
            dst[ i ] = 0;                        // Type = indexed by i

        dst[ 0 ] = dst[ 1 ];                     // (Gag "set but never used" warning message from compiler)
    };
}

__global__ void worker_1 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case B:
        for( i=0; i<sizeof(dst); ++i )           // Count = sizeof(dst)
            dst[ 0 ] = 0;                        // Type = always the same element, 0

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_2 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case C:
        for( i=0; i<(sizeof(dst)/2); ++i )       // Count = sizeof(dst)/2
            dst[ i ] = 0;                        // Type = indexed by i

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_3 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case D:
        for( i=0; i<(sizeof(dst)/2); ++i )       // Count = sizeof(dst)/2
            dst[ 0 ] = 0;                        // Type = always the same element, 0

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_4 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case E:
        for( i=0; i<(sizeof(dst)/4); ++i )       // Count = sizeof(dst)/4
            dst[ i ] = 0;                        // Type = indexed by i

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_5 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];
    int i;

    if ( threadIdx.x == 0 )
    {                                            // Case F:
        for( i=0; i<(sizeof(dst)/4); ++i )       // Count = sizeof(dst)/4
            dst[ 0 ] = 0;                        // Type = always the same element, 0

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_6 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];

    if ( threadIdx.x == 0 )
    {                                            // Case G:
        dst[ 0 ] = 0;                            // Count = 10
        dst[ 1 ] = 1;                            // Type = different elements
        dst[ 2 ] = 2;
        dst[ 3 ] = 3;
        dst[ 4 ] = 4;
        dst[ 5 ] = 5;
        dst[ 6 ] = 6;
        dst[ 7 ] = 7;
        dst[ 8 ] = 8;
        dst[ 9 ] = 9;

        dst[ 0 ] = dst[ 1 ];
    };
}

__global__ void worker_7 ( char * src )
{
    __shared__ char dst[ (1024 * 15) ];

    if ( threadIdx.x == 0 )
    {                                            // Case H:
        dst[ 0 ] = 0;                            // Count = 10
        dst[ 0 ] = 1;                            // Type = always the same element, 0
        dst[ 0 ] = 2;
        dst[ 0 ] = 3;
        dst[ 0 ] = 4;
        dst[ 0 ] = 5;
        dst[ 0 ] = 6;
        dst[ 0 ] = 7;
        dst[ 0 ] = 8;
        dst[ 0 ] = 9;

        dst[ 0 ] = dst[ 1 ];
    };
}
""")

# Resolve the eight compiled kernels by name so each can be timed separately.
(worker_0, worker_1, worker_2, worker_3,
 worker_4, worker_5, worker_6, worker_7) = [
    krnl.get_function("worker_%d" % idx) for idx in range(8)]

#
==================================================================================================

print "       gridDim_x   gridDim_y  blockDim_x  blockDim_y  blockDim_z",
print "%9s" % "A",
print "%9s" % "B",
print "%9s" % "C",
print "%9s" % "D",
print "%9s" % "E",
print "%9s" % "F",
print "%9s" % "G",
print "%9s" % "H"

timewas0 = time.time()
n = 0
for (gridDim_x, gridDim_y, blockDim_x, blockDim_y, blockDim_z) in
gridBlockList:
    print "%2u:" % n, ("%12u" * 5) % (gridDim_x, gridDim_y, blockDim_x,
blockDim_y, blockDim_z),
    sys.stdout.flush()
    for worker in [ worker_0, worker_1, worker_2, worker_3, worker_4,
worker_5, worker_6, worker_7 ]:
        timewas1 = time.time()
        worker( dev_dataRecords, grid=(gridDim_x, gridDim_y),
block=(blockDim_x,blockDim_y,blockDim_z) )
        pycuda.driver.Context.synchronize()
        print "%9.6f" % (time.time() - timewas1),
    print
    n += 1
print "%1.6f secs total" % (time.time() - timewas0)
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net

Reply via email to