Hi,

I'm trying to run the following code on my Mac laptop to multiply a 1D vector 
by a square matrix and then dot the result with another vector. It works fine 
when the dimension of the matrix and vector (DIM) is small but when at test 
values close the operating level (DIM ~ 1500 - 2000), it computes for a few 
seconds and then crashes the laptop with an unresponsive keyboard and screen. 

I've run the wiki-examples MatrixmulTIled.py with the same dimensions and that 
works fine. I would be grateful if someone could point out what I am doing 
wrong.

        Cheers,

        Matthew

import numpy as np
from pycuda import driver, compiler, gpuarray, tools
from pycuda.curandom import rand as curand
import pycuda.autoinit
import time

DIM = 1600
BLOCK_SIZE = 16 # 512 max threads per mp
gi = np.random.randn(DIM).astype(np.float32)
gj = np.random.randn(DIM).astype(np.float32)
dissim = np.random.randn(DIM, DIM).astype(np.float32)

ans = gpuarray.empty((DIM,), np.float32)

"""
Each thread calculates for one term (axis)
"""
kernel_code_template = '''
__global__ void GOPKernel(float *gi, float *gj, float *d, float *ans)
{

 // Element
 int row = blockIdx.y * blockDim.y + threadIdx.y;
 float val = 0;

 for (int k = 0; k < %(DIM)s; ++k) {
   float d_elem = d[row * %(DIM)s + k];
   float gi_elem = gi[k];
   float gj_elem = gj[k];
   val += 0.5 * gi[row] * d_elem * gi_elem; 
   val += 0.5 * gj[row] * d_elem * gj_elem;
   val -= gi[row] * d_elem * gj_elem;
 }

 ans[row] = val;
}
'''

# Get the kernel code from the template
kernel_code = kernel_code_template % {
 'DIM': DIM
 }

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get the kernel function from the compiled module
gopker = mod.get_function("GOPKernel")
gi = gpuarray.to_gpu(gi)
gj = gpuarray.to_gpu(gj)
dissim = gpuarray.to_gpu(dissim)

gridx = DIM / BLOCK_SIZE if DIM / BLOCK_SIZE == 1 else DIM / BLOCK_SIZE + 1
gridy = DIM / BLOCK_SIZE if DIM / BLOCK_SIZE == 1 else DIM / BLOCK_SIZE + 1

# Call the function on the card
gopker(
 # inputs
 gi, gj, dissim,
 # output
 ans,
 # block of multiple threads
 block = (BLOCK_SIZE, BLOCK_SIZE, 1),
 # grid of blocks
 grid = (gridx, gridy)
 )

# Get result
z = ans.get()


_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda

Reply via email to