> Looking at your streams.py code, I'm wondering why you're expecting
> things to run in parallel if you're synchronizing with both stream1 and
> stream2 after you're done with each of them? Wouldn't that explicitly
> prevent any parallelism between them?
>
> What am I missing?
>
> Andreas

The synchronization was designed to run the two streams as follows:
[start new task 1 : sync 2 : start new task 2 : sync 1] repeat

Notice that I did not sync with the newly started task but with the
previous one.


To make things clearer I attach two short tests: one runs with
overlapping mem-copy and exec, and the other doesn't.
This is easiest to see in the Compute Visual Profiler (turn off most of
the data collection so it only runs once; otherwise it runs 12 times).

  notWorking:
stream 1: put
stream 1: exec
stream 1: get
stream 2: put
stream 2: exec
stream 2: get


  Working:
stream 1: put
stream 1: exec
stream 2: put
stream 1: get
stream 2: exec
stream 2: get


So it seems that CUDA does not work the way I was expecting it to. Can
someone explain why CUDA does not start the stream 2 put operation
before the stream 1 get operation in notWorking.py?

-Magnus

ps CUDA 3.2 ... don't know what CUDA 4.0 does.



-----------------------------------------------
Magnus Paulsson
Assistant Professor
School of Computer Science, Physics and Mathematics
Linnaeus University
Phone: +46-480-446308
Mobile: +46-70-6942987
import pyfft.cuda as pyfft
import numpy.fft as cpufft
import pycuda.driver as cuda
import pycuda.autoinit, time
import pycuda.gpuarray as gpuarray
import numpy as N

typeC = N.complex128
typeR = N.float64

NN  = 1024*16*32*4
NN2 = 1024/32

# Two streams
stream1, stream2 = cuda.Stream(), cuda.Stream()

# Two fft plans for the streams
Plan1 = pyfft.Plan((NN,),dtype=typeC,stream=stream1,\
                       wait_for_finish=False, fast_math=True)
Plan2 = pyfft.Plan((NN,),dtype=typeC,stream=stream2,\
                       wait_for_finish=False, fast_math=True)

# Two page locked numpy arrays to send data to device
a1=N.array(N.random.rand(NN),typeC)
a1pl = cuda.pagelocked_empty_like(a1)
a1pl[:] = a1[:]
a2=N.array(N.random.rand(NN),typeC)
a2pl = cuda.pagelocked_empty_like(a2)
a2pl[:] = a2[:]

startTime = time.time()

# Get page-locked numpy array for device to cpu copy
fft1 = cuda.pagelocked_empty_like(a1)
fft2 = cuda.pagelocked_empty_like(a2)

a1_g = gpuarray.to_gpu_async(a1pl,stream=stream1)
print "1 to gpu",time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print "1 exec",time.time()-startTime
a2_g = gpuarray.to_gpu_async(a2pl,stream=stream2)
print "2 to gpu",time.time()-startTime
a1_g.get_async(ary=fft1, stream=stream1)
print "1 get",time.time()-startTime
Plan2.execute(a2_g,wait_for_finish=False)
print "2 exec",time.time()-startTime
a2_g.get_async(ary=fft2, stream=stream2)
print "2 get",time.time()-startTime

stream1.synchronize()
print "1 sync",time.time()-startTime
stream2.synchronize()
print "2 sync",time.time()-startTime

print "Time: ",time.time()-startTime

del Plan1, Plan2, stream1, stream2
pycuda.tools.clear_context_caches()
import pyfft.cuda as pyfft
import numpy.fft as cpufft
import pycuda.driver as cuda
import pycuda.autoinit, time
import pycuda.gpuarray as gpuarray
import numpy as N

typeC = N.complex128
typeR = N.float64

NN  = 1024*16*32*4
NN2 = 1024/32

# Two streams
stream1, stream2 = cuda.Stream(), cuda.Stream()

# Two fft plans for the streams
Plan1 = pyfft.Plan((NN,),dtype=typeC,stream=stream1,\
                       wait_for_finish=False, fast_math=True)
Plan2 = pyfft.Plan((NN,),dtype=typeC,stream=stream2,\
                       wait_for_finish=False, fast_math=True)

# Two page locked numpy arrays to send data to device
a1=N.array(N.random.rand(NN),typeC)
a1pl = cuda.pagelocked_empty_like(a1)
a1pl[:] = a1[:]
a2=N.array(N.random.rand(NN),typeC)
a2pl = cuda.pagelocked_empty_like(a2)
a2pl[:] = a2[:]

startTime = time.time()

# Get page-locked numpy array for device to cpu copy
fft1 = cuda.pagelocked_empty_like(a1)
fft2 = cuda.pagelocked_empty_like(a2)

a1_g = gpuarray.to_gpu_async(a1pl,stream=stream1)
print "1 to gpu",time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print "1 exec",time.time()-startTime
a1_g.get_async(ary=fft1, stream=stream1)
print "1 get",time.time()-startTime
a2_g = gpuarray.to_gpu_async(a2pl,stream=stream2)
print "2 to gpu",time.time()-startTime
Plan2.execute(a2_g,wait_for_finish=False)
print "2 exec",time.time()-startTime
a2_g.get_async(ary=fft2, stream=stream2)
print "2 get",time.time()-startTime

stream1.synchronize()
print "1 sync",time.time()-startTime
stream2.synchronize()
print "2 sync",time.time()-startTime

print "Time: ",time.time()-startTime

del Plan1, Plan2, stream1, stream2
pycuda.tools.clear_context_caches()
_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda

Reply via email to