I've been trying to use PyCUDA to pass data to and from a GPU for
processing with the device functions in the free version of the CULA
toolkit [1]. Based upon past postings to the list and the ctypes
approach used in parret [2], I gather that this is possible with CUDA
3.0. However, it seems that the CULA device functions I invoke are
having no effect: the results copied back to the CPU are unchanged.
(See attached code.)

Has anyone managed to do the above successfully? I'm using PyCUDA
0.94rc with CUDA 3.0 on 64-bit Linux.

                                                Thanks,
                                                L.G.

[1] http://www.culatools.com
[2] http://www.mathcs.emory.edu/~yfan/PARRET/
#!/usr/bin/env python

from pprint import pprint
import pycuda.driver as drv
import pycuda.compiler as comp
import pycuda.tools as tools
import numpy as np
import atexit
import ctypes

# Bring up CUDA and create a context on the default device.
drv.init()
context = tools.make_default_context()
# Make sure the context is torn down at interpreter exit:
atexit.register(context.detach)
# Handle to the device backing the context:
device = context.get_device()

# Load the CULA runtime and initialize it.  CULA routines return a
# culaStatus int, with 0 meaning success; the original code discarded
# the status, so a failed initialization would silently leave every
# subsequent CULA call a no-op on the data.
_libcula = ctypes.cdll.LoadLibrary('libcula.so')
_libcula.culaInitialize.restype = int
status = _libcula.culaInitialize()
if status != 0:
    raise RuntimeError('culaInitialize failed with status %d' % status)

# Release CULA resources when shutting down:
_libcula.culaShutdown.restype = int
atexit.register(_libcula.culaShutdown)

# Set up data:
a = np.array([[1, 2], [3, 4]], np.float32)

# Transfer the matrix to the device in column-major (Fortran) order, as
# CULA -- like LAPACK -- expects.  NB: `a.copy().T` is only a stride-trick
# view; memcpy_htod copies the *underlying buffer*, which is still the
# row-major data, so the transpose never reached the device.  Use
# np.asfortranarray() to actually lay the bytes out column-major.
a_gpu = drv.mem_alloc(a.nbytes)
drv.memcpy_htod(a_gpu, np.asfortranarray(a))

# ctypes prototype for culaDeviceSgesvd() (LAPACK sgesvd on device data):
#   culaStatus culaDeviceSgesvd(char jobu, char jobvt, int m, int n,
#                               float* a, int lda, float* s,
#                               float* u, int ldu, float* vt, int ldvt)
culaDeviceSgesvd = _libcula.culaDeviceSgesvd
culaDeviceSgesvd.restype = ctypes.c_int
_float_p = ctypes.POINTER(ctypes.c_float)
culaDeviceSgesvd.argtypes = [ctypes.c_char, ctypes.c_char,
                             ctypes.c_int, ctypes.c_int,
                             _float_p, ctypes.c_int,
                             _float_p,
                             _float_p, ctypes.c_int,
                             _float_p, ctypes.c_int]

# SVD parameters, LAPACK conventions:
jobu = 'A'    # compute all m columns of U
jobvt = 'A'   # compute all n rows of V^T
m, n = a.shape
lda = max(1, m)   # leading dimension of A
ldvt = n          # leading dimension of V^T
ldu = m           # leading dimension of U

# Allocate GPU memory for results:
u = np.zeros((ldu, m), np.float32)
s = np.zeros(min(m, n), np.float32)
v = np.zeros((n, n), np.float32)

u_gpu = drv.mem_alloc(u.nbytes)
s_gpu = drv.mem_alloc(s.nbytes)
v_gpu = drv.mem_alloc(v.nbytes)

drv.memcpy_htod(u_gpu, u)
drv.memcpy_htod(s_gpu, s)
drv.memcpy_htod(v_gpu, v)

# Run function:
make_float_ptr = lambda x : \
                 ctypes.cast(int(x), ctypes.POINTER(ctypes.c_float))
    
culaDeviceSgesvd(jobu, jobvt, m, n, make_float_ptr(a_gpu), lda,
                 make_float_ptr(s_gpu), make_float_ptr(u_gpu),
                 ldu, make_float_ptr(v_gpu), ldvt)

drv.memcpy_dtoh(u, u_gpu)
drv.memcpy_dtoh(s, s_gpu)
drv.memcpy_dtoh(v, v_gpu)

print 'numpy svd: '
pprint(np.linalg.svd(a))

print 'cula svd: '
pprint((u, s, v.T))


_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to