Hello,

I have another problem with my program.
I want to run a loop inside the kernel (in the loop I do a scalar multiplication of two vectors), but I don't know how. I created a __device__ function that does the scalar multiplication (using threads) and a __global__ function that only runs the loop and invokes the __device__ function. It runs, but for some reason I get different results each time, and instead of different numbers in the result array I get the array filled with the same number. I think this happens because the memory hasn't been cleared.
But calling free() inside the kernel gives an error:

   error: calling a host function from a __device__/__global__ function
   is not allowed
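
What I mean by cleaning is roughly the following on the host side (just a sketch of my idea, I'm not sure it is even the right place to do it, and I guess the __shared__ array inside the kernel doesn't need freeing at all):

   dest = cuda.mem_alloc(4*total_size)    # DeviceAllocation, 4 bytes per float32
   cuda.memset_d32(dest, 0, total_size)   # zero it from the host before the launch
   # ... launch the kernel ...
   dest.free()                            # free it from the host afterwards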

Honestly speaking, I've tried everything I can think of, but failed; maybe you can help me =)

Best regards,
         Mikhail Korobko
#Import pycuda
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.tools as tools
import pycuda.gpuarray as gpuarray
#Import numpy, matplotlib and timing
import matplotlib
matplotlib.use("Agg")
import numpy
from pylab import *
from time import clock

cr1 = clock()
#define number of threads and blocks
thread_strides = 256
block_size = 1*32
total_size = thread_strides*block_size
dtype = numpy.float32

##########################
###   Create arrays    ###
##########################
N = total_size

f = logspace(log(10.0),log(1570.0), N, base = e).astype(numpy.float32)
mas = logspace(log(1.0),log(1000.0), 100,base = e).astype(numpy.float32)

result = zeros(N,dtype = numpy.float32)

koeff = zeros(N,dtype = numpy.float32)
koeff[0] = 1
koeff[1] = 4
fi = 2

while fi < N-2:
	koeff[fi] = 2.0
	fi+=1
	koeff[fi] = 4.0
	fi+=1
	
koeff[fi] = 1
koeff *= f**(-4.0/3.0)
koeff *= log(f[fi]/f[0])/(3.0*fi)
koeff = numpy.array(koeff).astype(numpy.float32)
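
# For reference: the 1,4,2,4,...,2,4,1 weight pattern built by the while loop
# above (before the f and log scaling) can also be written without the loop.
# koeff_alt is only a sketch under that assumption and is not used below.
koeff_alt = zeros(N, dtype = numpy.float32)
koeff_alt[0] = 1
koeff_alt[1:N-2:2] = 4      # odd indices 1, 3, ..., N-3
koeff_alt[2:N-2:2] = 2      # even indices 2, 4, ..., N-4
koeff_alt[N-2] = 1          # last weight; koeff_alt[N-1] stays 0, as in the loop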

cr = clock()
##########################
###   Arrays to GPU    ###
##########################
a_gpu = gpuarray.to_gpu(koeff)
b_gpu = gpuarray.to_gpu(result)
f_gpu = gpuarray.to_gpu(f)
mas_gpu = gpuarray.to_gpu(mas)

dest = cuda.mem_alloc(32*total_size*100*32)
res = cuda.mem_alloc(32*total_size*100*32)
W = cuda.mem_alloc(32*total_size)
S_h_quant = cuda.mem_alloc(32*total_size)
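# mem_alloc() expects a size in bytes; one float32 element takes
# numpy.dtype(numpy.float32).itemsize == 4 bytes, so the factor 32 above
# probably just over-sizes the buffers.  A sketch of the byte count I think
# is needed (not used below):
float_nbytes = numpy.dtype(numpy.float32).itemsize   # == 4
# e.g. cuda.mem_alloc(float_nbytes*total_size) for total_size float32 values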
##########################
###   C++ code         ###
##########################
src = SourceModule("""
#include <pycuda-complex.hpp>
#define BLOCK_SIZE 256
#define _USE_MATH_DEFINES
#include <math.h>
#include <stdio.h>

#define M (1.0)
#define H_BAR (1.0)
#define L (4000.0)
#define G  (2*M_PI*100)

#define W(i) (2.0*M_PI*f[i]) 
#define REDUCE(b, c) (b + c)


__device__ float S_h_quant(int i,  float *f,float mas)
	{return mas*((4*H_BAR/(M*L*L*W(i)*W(i)))*((2.0*pow(G,4))/(W(i)*W(i)*(G*G + W(i)*W(i)))  + (W(i)*W(i)*(G*G + W(i)*W(i)))/(2.0*pow(G,4))));
	}

__device__ float read_and_map(int i, float *a, float *be, float *f,float mas)
	{return a[i]/(be[i] + S_h_quant(i,f,mas));
	}
__device__ float loop(float *dest, float *a, float *be, float *f,float *W,float *S_h_quant,float mas)
	{__shared__ float redResult[BLOCK_SIZE];
	
	unsigned int n = 1*256*32;
	unsigned int tid = threadIdx.x;
	unsigned int seq_count = n*BLOCK_SIZE*tid + n - 1;
	unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;
	float r_a_m;
	float acc;
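	// each thread accumulates a partial sum of a[i]/(be[i] + S_h_quant(i,f,mas)),
	// striding through the data by BLOCK_SIZE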
	for (unsigned s = 0; s < seq_count; ++s)
		{ 
		if (i >= n)
			break;
		r_a_m = read_and_map(i,a,be,f,mas);
		acc = REDUCE(acc, r_a_m); 
  
		i += BLOCK_SIZE; 
		}	
	
		redResult[tid] = acc;
          
		__syncthreads();
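		// tree-reduce the per-thread partial sums in shared memory down to redResult[0]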
          
		#if (BLOCK_SIZE >= 512) 
		  if (tid < 256) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 256]); }
		  __syncthreads();
		#endif
          
		#if (BLOCK_SIZE >= 256) 
		  if (tid < 128) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 128]); } 
		  __syncthreads(); 
		  #endif
          
		#if (BLOCK_SIZE >= 128) 
		  if (tid < 64) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 64]); } 
		  __syncthreads(); 
		  #endif
          
		if (tid < 32) 
		{
		  if (BLOCK_SIZE >= 64) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 32]);
		  if (BLOCK_SIZE >= 32) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 16]);
		  if (BLOCK_SIZE >= 16) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 8]);
		  if (BLOCK_SIZE >= 8)  redResult[tid] = REDUCE(redResult[tid], redResult[tid + 4]);
		  if (BLOCK_SIZE >= 4)  redResult[tid] = REDUCE(redResult[tid], redResult[tid + 2]);
		  if (BLOCK_SIZE >= 2)  redResult[tid] = REDUCE(redResult[tid], redResult[tid + 1]);
		}
          
		if (tid == 0) dest[blockIdx.x] = redResult[0];
		//free(redResult);
		return dest[0];
                
	}
__global__ void adding(float *res, float *dest, float *a, float *be, float *f,float *W,float *S_h_quant,float *cmas )
    {   
	for (int k=0;k<100; ++k)
		{float mas = cmas[k];
		res[k] = loop(dest,a,be,f,W,S_h_quant,mas);
		//free(des);
		}
    }

""")
############################
###   Function call      ###
############################

func = src.get_function("adding")
func(res,dest,a_gpu,b_gpu,f_gpu,W,S_h_quant,mas_gpu, block = (thread_strides, 1, 1), grid = (block_size,1))
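# copy the 100 accumulated results back from the device into a numpy array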
c = cuda.from_device(res, 100, numpy.float32)


print c
print clock()-cr
print clock()-cr1


_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda
