
I'm trying to use multiple GPUs with MPI and CUDA IPC handles, instead of
the built-in MPI primitives, for peer-to-peer (p2p) communication.

I don't think I fully understand how contexts should be managed.  For
example, I have two versions of a toy example that tries out accessing data
between processes via an IPC handle.  Both seem to work, in the sense that
process 1 can 'see' the data from process 0, but the first version completes
without any error, while the second version generates the following error:

PyCUDA WARNING: a clean-up operation failed (dead context maybe?)

cuMemFree failed: invalid value

The two versions are included below.  I would appreciate any insight as to
what I'm doing wrong.


Here are the two versions:

*VERSION 1*


from mpi4py import MPIimport numpy as npimport atexitimport
pycuda.driver as drvimport pycuda.gpuarray as gpuarrayclass
TestMGPU(object):    def __init__(self):        self.mpi_size =
MPI.COMM_WORLD.size        self.mpi_rank = MPI.COMM_WORLD.rank    def
proc(self):        if self.mpi_rank == 0:            ctx =
drv.Device(self.mpi_rank).make_context()            self.x_gpu =
gpuarray.to_gpu(np.random.rand(8))            h =
MPI.COMM_WORLD.send((h, self.x_gpu.shape, self.x_gpu.dtype), dest=1)
         print 'p1 self.x_gpu:', self.x_gpu            ctx.detach()
    else:            ctx = drv.Device(self.mpi_rank).make_context()
        h, s, d = MPI.COMM_WORLD.recv(source=0)            ptr =
drv.IPCMemoryHandle(h)            xt_gpu = gpuarray.GPUArray(s, d,
gpudata=ptr)            print 'xt_gpu: ', xt_gpu
ctx.detach()if __name__ == '__main__':    drv.init()
atexit.register(MPI.Finalize)    a = TestMGPU()    a.proc()

*VERSION 2  (Imports are the same)*

class TestMGPU(object):    def __init__(self):        self.mpi_size =
MPI.COMM_WORLD.size        self.mpi_rank = MPI.COMM_WORLD.rank
self.x_gpu = gpuarray.to_gpu(np.random.rand(8))    def proc(self):
   if self.mpi_rank == 0:            h =
MPI.COMM_WORLD.send((h, self.x_gpu.shape, self.x_gpu.dtype), dest=1)
         print 'p1 self.x_gpu:', self.x_gpu        else:            h,
s, d = MPI.COMM_WORLD.recv(source=0)            ptr =
drv.IPCMemoryHandle(h)            xt_gpu = gpuarray.GPUArray(s, d,
gpudata=ptr)            print 'xt_gpu: ', xt_gpuif __name__ ==
'__main__':    drv.init()    ctx =
atexit.register(ctx.pop)    atexit.register(MPI.Finalize)    a =
TestMGPU()    a.proc()
