Hello,
I have another problem with my program.
I want to do a loop inside the kernel(in the loop I do sclar
multiplication of two vectors), but I don't know how. I created a
__device__ function with scalar multiplication(using threads) and
__global__ function with only loop invoking __device__ function. It
works, but for some reason I get different results each time. And
instead of different numbers in the result array, I get the array filled
with same numbers. I think, it appears because memory hasn't been cleaned.
But function free() invokes an error:
error: calling a host function from a __device__/__global__ function
is not allowed
Honestly speaking, I've tried everything I can imagine, but failed,
maybe you can help me=)
Best regards,
Mikhail Korobko
#Import pycuda
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.tools as tools
import pycuda.gpuarray as gpuarray
#Import math
import matplotlib
matplotlib.use("Agg")
import numpy
from pylab import *
from time import clock
cr1 = clock()
#define number of treads and blocks
thread_strides = 256
block_size = 1*32
total_size = thread_strides*block_size
dtype = numpy.float32
##########################
### Create arrays ###
##########################
N = total_size
f = logspace(log(10.0),log(1570.0), N, base = e).astype(numpy.float32)
mas = logspace(log(1.0),log(1000.0), 100,base = e).astype(numpy.float32)
result = zeros(N,dtype = numpy.float32)
koeff = zeros(N,dtype = numpy.float32)
koeff[0] = 1
koeff[1] = 4
fi = 2
while fi < N-2:
koeff[fi] = 2.0
fi+=1
koeff[fi] = 4.0
fi+=1
koeff[fi] = 1
koeff *= f**(-4.0/3.0)
koeff *= log(f[fi]/f[0])/(3.0*fi)
koeff = numpy.array(koeff).astype(numpy.float32)
cr = clock()
##########################
### Arrays to GPU ####
##########################
a_gpu = gpuarray.to_gpu(koeff)
b_gpu = gpuarray.to_gpu(result)
f_gpu = gpuarray.to_gpu(f)
mas_gpu = gpuarray.to_gpu(mas)
dest = cuda.mem_alloc(32*total_size*100*32)
res = cuda.mem_alloc(32*total_size*100*32)
W = cuda.mem_alloc(32*total_size)
S_h_quant = cuda.mem_alloc(32*total_size)
##########################
### C++ code ###
##########################
src = SourceModule("""
#include <pycuda-complex.hpp>
#define BLOCK_SIZE 256
#define _USE_MATH_DEFINES
#include <math.h>
#include <stdio.h>
#define M (1.0)
#define H_BAR (1.0)
#define L (4000.0)
#define G (2*M_PI*100)
#define W(i) (2.0*M_PI*f[i])
#define REDUCE(b, c) (b + c)
__device__ float S_h_quant(int i, float *f,float mas)
{return mas*((4*H_BAR/(M*L*L*W(i)*W(i)))*((2.0*pow(G,4))/(W(i)*W(i)*(G*G + W(i)*W(i))) + (W(i)*W(i)*(G*G + W(i)*W(i)))/(2.0*pow(G,4))));
}
__device__ float read_and_map(int i, float *a, float *be, float *f,float mas)
{return a[i]/(be[i] + S_h_quant(i,f,mas));
}
__device__ float loop(float *dest, float *a, float *be, float *f,float *W,float *S_h_quant,float mas)
{__shared__ float redResult[BLOCK_SIZE];
unsigned int n = 1*256*32;
unsigned int tid = threadIdx.x;
unsigned int seq_count = n*BLOCK_SIZE*tid + n - 1;
unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;
float r_a_m;
float acc;
for (unsigned s = 0; s < seq_count; ++s)
{
if (i >= n)
break;
r_a_m = read_and_map(i,a,be,f,mas);
acc = REDUCE(acc, r_a_m);
i += BLOCK_SIZE;
}
redResult[tid] = acc;
__syncthreads();
#if (BLOCK_SIZE >= 512)
if (tid < 256) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 256]); }
__syncthreads();
#endif
#if (BLOCK_SIZE >= 256)
if (tid < 128) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 128]); }
__syncthreads();
#endif
#if (BLOCK_SIZE >= 128)
if (tid < 64) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 64]); }
__syncthreads();
#endif
if (tid < 32)
{
if (BLOCK_SIZE >= 64) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 32]);
if (BLOCK_SIZE >= 32) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 16]);
if (BLOCK_SIZE >= 16) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 8]);
if (BLOCK_SIZE >= 8) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 4]);
if (BLOCK_SIZE >= 4) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 2]);
if (BLOCK_SIZE >= 2) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 1]);
}
if (tid == 0) dest[blockIdx.x] = redResult[0];
//free(redResult);
return dest[0];
}
__global__ void adding(float *res, float *dest, float *a, float *be, float *f,float *W,float *S_h_quant,float *cmas )
{
for (int k=0;k<100; ++k)
{float mas = cmas[k];
res[k] = loop(dest,a,be,f,W,S_h_quant,mas);
//free(des);
}
}
""")
############################
### Function call ###
############################
func = src.get_function("adding")
func(res,dest,a_gpu,b_gpu,f_gpu,W,S_h_quant,mas_gpu, block = (thread_strides, 1, 1), grid = (block_size,1))
c = cuda.from_device(res, 100, numpy.float32)
print c
print clock()-cr
print clock()-cr1
_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda