Hi everyone,
This has been causing me problems for a few weeks now, and I'm hoping
someone would be able to shed some light on it. I need to run some
CPU-intensive tasks in the background while launching GPU kernels in the
main loop of a project I'm working on, so I've been trying to offload to
a multiprocessing process. But it seems that whenever I try to launch a
kernel while the background process is active, the kernel fails to get
the correct results (it does not throw any errors). After the kernel
has returned wrong results once, it continues to fail for the remainder of
the run, even if the background process has already finished and joined.
I've put together a small code sample to demonstrate this [attached].
Is this known behaviour, and if so, is there any workaround I can use?
Or am I doing something completely wrong?
Thanks,
Brendan Wood
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import hashlib
import multiprocessing
import time
n = 1 << 24 # 16M elements
blocksize = 512
cuda_source = """
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#define BLOCKSIZE %d
__global__ void testkernel(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t n) {
int tid = blockIdx.x * BLOCKSIZE + threadIdx.x;
c[tid] = a[tid] * b[tid];
}
""" % blocksize
# background process that just sleeps
class BGProcess(multiprocessing.Process):
def __init__(self):
multiprocessing.Process.__init__(self)
def run(self):
time.sleep(2)
def run_kernel(a, b, c, a_d, b_d, c_d, n, kernel):
c[:] = 0
cuda.memcpy_htod(c_d, c)
cuda.memcpy_htod(a_d, a)
cuda.memcpy_htod(b_d, b)
# element-wise product: c = a * b
kernel(
a_d,
b_d,
c_d,
np.uint32(n),
block=(blocksize, 1, 1),
grid=(n/blocksize, 1),
)
cuda.memcpy_dtoh(c, c_d)
# return a hash for comparison
return hashlib.md5(c).hexdigest()
def main():
mod = SourceModule(cuda_source, arch='sm_20')
kernel = mod.get_function('testkernel')
a = cuda.pagelocked_zeros((n,), dtype=np.uint32)
b = cuda.pagelocked_zeros((n,), dtype=np.uint32)
c = cuda.pagelocked_zeros((n,), dtype=np.uint32)
c_ref = cuda.pagelocked_zeros((n,), dtype=np.uint32)
a_d = cuda.mem_alloc(n * np.uint32().nbytes)
b_d = cuda.mem_alloc(n * np.uint32().nbytes)
c_d = cuda.mem_alloc(n * np.uint32().nbytes)
a[:] = 2
b[:] = 3
# reference calculation and hash
c_ref = a * b
c_ref_hash = hashlib.md5(c_ref).hexdigest()
# test kernel to show it works
c_hash_1 = run_kernel(a, b, c, a_d, b_d, c_d, n, kernel)
# test kernel with multiprocessing process in background
bg = BGProcess()
bg.start()
c_hash_2 = run_kernel(a, b, c, a_d, b_d, c_d, n, kernel)
bg.join()
# test kernel against after background process is finished
c_hash_3 = run_kernel(a, b, c, a_d, b_d, c_d, n, kernel)
# print results
print c_ref_hash, '- reference hash for correct results'
print c_hash_1, '- hash for first test (vanilla)'
print c_hash_2, '- hash for second test (multiprocessing process in background)'
print c_hash_3, '- hash for third test (background process finished)'
# Run the demonstration only when this file is executed as a script, so that
# importing the module does not launch kernels or start processes.
if __name__ == '__main__':
    main()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda